1 | /* Copyright (C) 1996-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published |
6 | by the Free Software Foundation; version 2 of the License, or |
7 | (at your option) any later version. |
8 | |
9 | This program is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License |
15 | along with this program; if not, see <https://www.gnu.org/licenses/>. */ |
16 | |
17 | #ifdef HAVE_CONFIG_H |
18 | # include <config.h> |
19 | #endif |
20 | |
21 | #include <assert.h> |
22 | #include <ctype.h> |
23 | #include <errno.h> |
24 | #include <libintl.h> |
25 | #include <stdarg.h> |
26 | #include <stdlib.h> |
27 | #include <string.h> |
28 | #include <stdint.h> |
29 | |
30 | #include "localedef.h" |
31 | #include "charmap.h" |
32 | #include "error.h" |
33 | #include "linereader.h" |
34 | #include "locfile.h" |
35 | |
36 | /* Prototypes for local functions. */ |
37 | static struct token *get_toplvl_escape (struct linereader *lr); |
38 | static struct token *get_symname (struct linereader *lr); |
39 | static struct token *get_ident (struct linereader *lr); |
40 | static struct token *get_string (struct linereader *lr, |
41 | const struct charmap_t *charmap, |
42 | struct localedef_t *locale, |
43 | const struct repertoire_t *repertoire, |
44 | int verbose); |
45 | |
46 | |
47 | struct linereader * |
48 | lr_open (const char *fname, kw_hash_fct_t hf) |
49 | { |
50 | FILE *fp; |
51 | |
52 | if (fname == NULL || strcmp (fname, "-" ) == 0 |
53 | || strcmp (fname, "/dev/stdin" ) == 0) |
54 | return lr_create (stdin, "<stdin>" , hf); |
55 | else |
56 | { |
57 | fp = fopen (fname, "rm" ); |
58 | if (fp == NULL) |
59 | return NULL; |
60 | return lr_create (fp, fname, hf); |
61 | } |
62 | } |
63 | |
64 | struct linereader * |
65 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) |
66 | { |
67 | struct linereader *result; |
68 | int n; |
69 | |
70 | result = (struct linereader *) xmalloc (sizeof (*result)); |
71 | |
72 | result->fp = fp; |
73 | result->fname = xstrdup (fname); |
74 | result->buf = NULL; |
75 | result->bufsize = 0; |
76 | result->lineno = 1; |
77 | result->idx = 0; |
78 | result->comment_char = '#'; |
79 | result->escape_char = '\\'; |
80 | result->translate_strings = 1; |
81 | result->return_widestr = 0; |
82 | |
83 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); |
84 | if (n < 0) |
85 | { |
86 | int save = errno; |
87 | fclose (result->fp); |
88 | free ((char *) result->fname); |
89 | free (result); |
90 | errno = save; |
91 | return NULL; |
92 | } |
93 | |
94 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') |
95 | n -= 2; |
96 | |
97 | result->buf[n] = '\0'; |
98 | result->bufact = n; |
99 | result->hash_fct = hf; |
100 | |
101 | return result; |
102 | } |
103 | |
104 | |
105 | int |
106 | lr_eof (struct linereader *lr) |
107 | { |
108 | return lr->bufact = 0; |
109 | } |
110 | |
111 | |
112 | void |
113 | lr_ignore_rest (struct linereader *lr, int verbose) |
114 | { |
115 | if (verbose) |
116 | { |
117 | while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' |
118 | && lr->buf[lr->idx] != lr->comment_char) |
119 | if (lr->buf[lr->idx] == '\0') |
120 | { |
121 | if (lr_next (lr) < 0) |
122 | return; |
123 | } |
124 | else |
125 | ++lr->idx; |
126 | |
127 | if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) |
128 | && lr->buf[lr->idx] != lr->comment_char) |
129 | lr_error (lr, _("trailing garbage at end of line" )); |
130 | } |
131 | |
132 | /* Ignore continued line. */ |
133 | while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') |
134 | if (lr_next (lr) < 0) |
135 | break; |
136 | |
137 | lr->idx = lr->bufact; |
138 | } |
139 | |
140 | |
141 | void |
142 | lr_close (struct linereader *lr) |
143 | { |
144 | fclose (lr->fp); |
145 | free (lr->buf); |
146 | free (lr); |
147 | } |
148 | |
149 | |
150 | int |
151 | lr_next (struct linereader *lr) |
152 | { |
153 | int n; |
154 | |
155 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); |
156 | if (n < 0) |
157 | return -1; |
158 | |
159 | ++lr->lineno; |
160 | |
161 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') |
162 | { |
163 | #if 0 |
164 | /* XXX Is this correct? */ |
165 | /* An escaped newline character is substituted with a single <SP>. */ |
166 | --n; |
167 | lr->buf[n - 1] = ' '; |
168 | #else |
169 | n -= 2; |
170 | #endif |
171 | } |
172 | |
173 | lr->buf[n] = '\0'; |
174 | lr->bufact = n; |
175 | lr->idx = 0; |
176 | |
177 | return 0; |
178 | } |
179 | |
180 | |
181 | /* Defined in error.c. */ |
182 | /* This variable is incremented each time `error' is called. */ |
183 | extern unsigned int error_message_count; |
184 | |
185 | /* The calling program should define program_name and set it to the |
186 | name of the executing program. */ |
187 | extern char *program_name; |
188 | |
189 | |
190 | struct token * |
191 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
192 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
193 | int verbose) |
194 | { |
195 | int ch; |
196 | |
197 | while (1) |
198 | { |
199 | do |
200 | { |
201 | ch = lr_getc (lr); |
202 | |
203 | if (ch == EOF) |
204 | { |
205 | lr->token.tok = tok_eof; |
206 | return &lr->token; |
207 | }; |
208 | |
209 | if (ch == '\n') |
210 | { |
211 | lr->token.tok = tok_eol; |
212 | return &lr->token; |
213 | } |
214 | } |
215 | while (isspace (ch)); |
216 | |
217 | if (ch != lr->comment_char) |
218 | break; |
219 | |
220 | /* Is there an newline at the end of the buffer? */ |
221 | if (lr->buf[lr->bufact - 1] != '\n') |
222 | { |
223 | /* No. Some people want this to mean that only the line in |
224 | the file not the logical, concatenated line is ignored. |
225 | Let's try this. */ |
226 | lr->idx = lr->bufact; |
227 | continue; |
228 | } |
229 | |
230 | /* Ignore rest of line. */ |
231 | lr_ignore_rest (lr, 0); |
232 | lr->token.tok = tok_eol; |
233 | return &lr->token; |
234 | } |
235 | |
236 | /* Match escape sequences. */ |
237 | if (ch == lr->escape_char) |
238 | return get_toplvl_escape (lr); |
239 | |
240 | /* Match ellipsis. */ |
241 | if (ch == '.') |
242 | { |
243 | if (strncmp (&lr->buf[lr->idx], "...(2)...." , 10) == 0) |
244 | { |
245 | int cnt; |
246 | for (cnt = 0; cnt < 10; ++cnt) |
247 | lr_getc (lr); |
248 | lr->token.tok = tok_ellipsis4_2; |
249 | return &lr->token; |
250 | } |
251 | if (strncmp (&lr->buf[lr->idx], "..." , 3) == 0) |
252 | { |
253 | lr_getc (lr); |
254 | lr_getc (lr); |
255 | lr_getc (lr); |
256 | lr->token.tok = tok_ellipsis4; |
257 | return &lr->token; |
258 | } |
259 | if (strncmp (&lr->buf[lr->idx], ".." , 2) == 0) |
260 | { |
261 | lr_getc (lr); |
262 | lr_getc (lr); |
263 | lr->token.tok = tok_ellipsis3; |
264 | return &lr->token; |
265 | } |
266 | if (strncmp (&lr->buf[lr->idx], ".(2).." , 6) == 0) |
267 | { |
268 | int cnt; |
269 | for (cnt = 0; cnt < 6; ++cnt) |
270 | lr_getc (lr); |
271 | lr->token.tok = tok_ellipsis2_2; |
272 | return &lr->token; |
273 | } |
274 | if (lr->buf[lr->idx] == '.') |
275 | { |
276 | lr_getc (lr); |
277 | lr->token.tok = tok_ellipsis2; |
278 | return &lr->token; |
279 | } |
280 | } |
281 | |
282 | switch (ch) |
283 | { |
284 | case '<': |
285 | return get_symname (lr); |
286 | |
287 | case '0' ... '9': |
288 | lr->token.tok = tok_number; |
289 | lr->token.val.num = ch - '0'; |
290 | |
291 | while (isdigit (ch = lr_getc (lr))) |
292 | { |
293 | lr->token.val.num *= 10; |
294 | lr->token.val.num += ch - '0'; |
295 | } |
296 | if (isalpha (ch)) |
297 | lr_error (lr, _("garbage at end of number" )); |
298 | lr_ungetn (lr, 1); |
299 | |
300 | return &lr->token; |
301 | |
302 | case ';': |
303 | lr->token.tok = tok_semicolon; |
304 | return &lr->token; |
305 | |
306 | case ',': |
307 | lr->token.tok = tok_comma; |
308 | return &lr->token; |
309 | |
310 | case '(': |
311 | lr->token.tok = tok_open_brace; |
312 | return &lr->token; |
313 | |
314 | case ')': |
315 | lr->token.tok = tok_close_brace; |
316 | return &lr->token; |
317 | |
318 | case '"': |
319 | return get_string (lr, charmap, locale, repertoire, verbose); |
320 | |
321 | case '-': |
322 | ch = lr_getc (lr); |
323 | if (ch == '1') |
324 | { |
325 | lr->token.tok = tok_minus1; |
326 | return &lr->token; |
327 | } |
328 | lr_ungetn (lr, 2); |
329 | break; |
330 | } |
331 | |
332 | return get_ident (lr); |
333 | } |
334 | |
335 | |
336 | static struct token * |
337 | get_toplvl_escape (struct linereader *lr) |
338 | { |
339 | /* This is supposed to be a numeric value. We return the |
340 | numerical value and the number of bytes. */ |
341 | size_t start_idx = lr->idx - 1; |
342 | unsigned char *bytes = lr->token.val.charcode.bytes; |
343 | size_t nbytes = 0; |
344 | int ch; |
345 | |
346 | do |
347 | { |
348 | unsigned int byte = 0; |
349 | unsigned int base = 8; |
350 | |
351 | ch = lr_getc (lr); |
352 | |
353 | if (ch == 'd') |
354 | { |
355 | base = 10; |
356 | ch = lr_getc (lr); |
357 | } |
358 | else if (ch == 'x') |
359 | { |
360 | base = 16; |
361 | ch = lr_getc (lr); |
362 | } |
363 | |
364 | if ((base == 16 && !isxdigit (ch)) |
365 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
366 | { |
367 | esc_error: |
368 | lr->token.val.str.startmb = &lr->buf[start_idx]; |
369 | |
370 | while (ch != EOF && !isspace (ch)) |
371 | ch = lr_getc (lr); |
372 | lr->token.val.str.lenmb = lr->idx - start_idx; |
373 | |
374 | lr->token.tok = tok_error; |
375 | return &lr->token; |
376 | } |
377 | |
378 | if (isdigit (ch)) |
379 | byte = ch - '0'; |
380 | else |
381 | byte = tolower (ch) - 'a' + 10; |
382 | |
383 | ch = lr_getc (lr); |
384 | if ((base == 16 && !isxdigit (ch)) |
385 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
386 | goto esc_error; |
387 | |
388 | byte *= base; |
389 | if (isdigit (ch)) |
390 | byte += ch - '0'; |
391 | else |
392 | byte += tolower (ch) - 'a' + 10; |
393 | |
394 | ch = lr_getc (lr); |
395 | if (base != 16 && isdigit (ch)) |
396 | { |
397 | byte *= base; |
398 | byte += ch - '0'; |
399 | |
400 | ch = lr_getc (lr); |
401 | } |
402 | |
403 | bytes[nbytes++] = byte; |
404 | } |
405 | while (ch == lr->escape_char |
406 | && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); |
407 | |
408 | if (!isspace (ch)) |
409 | lr_error (lr, _("garbage at end of character code specification" )); |
410 | |
411 | lr_ungetn (lr, 1); |
412 | |
413 | lr->token.tok = tok_charcode; |
414 | lr->token.val.charcode.nbytes = nbytes; |
415 | |
416 | return &lr->token; |
417 | } |
418 | |
419 | |
/* Append the single byte CH to the growing buffer of the enclosing
   function.  Expects the locals `buf' (storage), `bufact' (bytes in
   use) and `bufmax' (capacity) to be in scope; the capacity is doubled
   whenever the buffer is full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufmax == bufact)                                                   \
        buf = xrealloc (buf, bufmax *= 2);                                    \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)
431 | |
432 | |
/* Append the L bytes starting at S to the growing buffer of the
   enclosing function.  Expects the locals `buf' (storage), `bufact'
   (bytes in use) and `bufmax' (capacity, never zero) to be in scope.
   The capacity is doubled as often as necessary for the new data to
   fit; the bytes already stored and the write position are left
   untouched by the growth.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          do                                                                  \
            bufmax *= 2;                                                      \
          while (bufact + _l > bufmax);                                       \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)
448 | |
449 | |
/* Append the wide character CH to the growing wide buffer of the
   enclosing function.  Expects the locals `buf2' (storage), `buf2act'
   (elements in use) and `buf2max' (capacity counted in elements) to be
   in scope; the capacity is doubled whenever the buffer is full.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2max == buf2act)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (uint32_t));                \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
461 | |
462 | |
463 | static struct token * |
464 | get_symname (struct linereader *lr) |
465 | { |
466 | /* Symbol in brackets. We must distinguish three kinds: |
467 | 1. reserved words |
468 | 2. ISO 10646 position values |
469 | 3. all other. */ |
470 | char *buf; |
471 | size_t bufact = 0; |
472 | size_t bufmax = 56; |
473 | const struct keyword_t *kw; |
474 | int ch; |
475 | |
476 | buf = (char *) xmalloc (bufmax); |
477 | |
478 | do |
479 | { |
480 | ch = lr_getc (lr); |
481 | if (ch == lr->escape_char) |
482 | { |
483 | int c2 = lr_getc (lr); |
484 | ADDC (c2); |
485 | |
486 | if (c2 == '\n') |
487 | ch = '\n'; |
488 | } |
489 | else |
490 | ADDC (ch); |
491 | } |
492 | while (ch != '>' && ch != '\n'); |
493 | |
494 | if (ch == '\n') |
495 | lr_error (lr, _("unterminated symbolic name" )); |
496 | |
497 | /* Test for ISO 10646 position value. */ |
498 | if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) |
499 | { |
500 | char *cp = buf + 1; |
501 | while (cp < &buf[bufact - 1] && isxdigit (*cp)) |
502 | ++cp; |
503 | |
504 | if (cp == &buf[bufact - 1]) |
505 | { |
506 | /* Yes, it is. */ |
507 | lr->token.tok = tok_ucs4; |
508 | lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); |
509 | |
510 | return &lr->token; |
511 | } |
512 | } |
513 | |
514 | /* It is a symbolic name. Test for reserved words. */ |
515 | kw = lr->hash_fct (buf, bufact - 1); |
516 | |
517 | if (kw != NULL && kw->symname_or_ident == 1) |
518 | { |
519 | lr->token.tok = kw->token; |
520 | free (buf); |
521 | } |
522 | else |
523 | { |
524 | lr->token.tok = tok_bsymbol; |
525 | |
526 | buf = xrealloc (buf, bufact + 1); |
527 | buf[bufact] = '\0'; |
528 | |
529 | lr->token.val.str.startmb = buf; |
530 | lr->token.val.str.lenmb = bufact - 1; |
531 | } |
532 | |
533 | return &lr->token; |
534 | } |
535 | |
536 | |
537 | static struct token * |
538 | get_ident (struct linereader *lr) |
539 | { |
540 | char *buf; |
541 | size_t bufact; |
542 | size_t bufmax = 56; |
543 | const struct keyword_t *kw; |
544 | int ch; |
545 | |
546 | buf = xmalloc (bufmax); |
547 | bufact = 0; |
548 | |
549 | ADDC (lr->buf[lr->idx - 1]); |
550 | |
551 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' |
552 | && ch != '<' && ch != ',' && ch != EOF) |
553 | { |
554 | if (ch == lr->escape_char) |
555 | { |
556 | ch = lr_getc (lr); |
557 | if (ch == '\n' || ch == EOF) |
558 | { |
559 | lr_error (lr, _("invalid escape sequence" )); |
560 | break; |
561 | } |
562 | } |
563 | ADDC (ch); |
564 | } |
565 | |
566 | lr_ungetc (lr, ch); |
567 | |
568 | kw = lr->hash_fct (buf, bufact); |
569 | |
570 | if (kw != NULL && kw->symname_or_ident == 0) |
571 | { |
572 | lr->token.tok = kw->token; |
573 | free (buf); |
574 | } |
575 | else |
576 | { |
577 | lr->token.tok = tok_ident; |
578 | |
579 | buf = xrealloc (buf, bufact + 1); |
580 | buf[bufact] = '\0'; |
581 | |
582 | lr->token.val.str.startmb = buf; |
583 | lr->token.val.str.lenmb = bufact; |
584 | } |
585 | |
586 | return &lr->token; |
587 | } |
588 | |
589 | |
590 | static struct token * |
591 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
592 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
593 | int verbose) |
594 | { |
595 | int return_widestr = lr->return_widestr; |
596 | char *buf; |
597 | wchar_t *buf2 = NULL; |
598 | size_t bufact; |
599 | size_t bufmax = 56; |
600 | |
601 | /* We must return two different strings. */ |
602 | buf = xmalloc (bufmax); |
603 | bufact = 0; |
604 | |
605 | /* We know it'll be a string. */ |
606 | lr->token.tok = tok_string; |
607 | |
608 | /* If we need not translate the strings (i.e., expand <...> parts) |
609 | we can run a simple loop. */ |
610 | if (!lr->translate_strings) |
611 | { |
612 | int ch; |
613 | |
614 | buf2 = NULL; |
615 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) |
616 | ADDC (ch); |
617 | |
618 | /* Catch errors with trailing escape character. */ |
619 | if (bufact > 0 && buf[bufact - 1] == lr->escape_char |
620 | && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) |
621 | { |
622 | lr_error (lr, _("illegal escape sequence at end of string" )); |
623 | --bufact; |
624 | } |
625 | else if (ch == '\n' || ch == EOF) |
626 | lr_error (lr, _("unterminated string" )); |
627 | |
628 | ADDC ('\0'); |
629 | } |
630 | else |
631 | { |
632 | int illegal_string = 0; |
633 | size_t buf2act = 0; |
634 | size_t buf2max = 56 * sizeof (uint32_t); |
635 | int ch; |
636 | |
637 | /* We have to provide the wide character result as well. */ |
638 | if (return_widestr) |
639 | buf2 = xmalloc (buf2max); |
640 | |
641 | /* Read until the end of the string (or end of the line or file). */ |
642 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) |
643 | { |
644 | size_t startidx; |
645 | uint32_t wch; |
646 | struct charseq *seq; |
647 | |
648 | if (ch != '<') |
649 | { |
650 | /* The standards leave it up to the implementation to decide |
651 | what to do with character which stand for themself. We |
652 | could jump through hoops to find out the value relative to |
653 | the charmap and the repertoire map, but instead we leave |
654 | it up to the locale definition author to write a better |
655 | definition. We assume here that every character which |
656 | stands for itself is encoded using ISO 8859-1. Using the |
657 | escape character is allowed. */ |
658 | if (ch == lr->escape_char) |
659 | { |
660 | ch = lr_getc (lr); |
661 | if (ch == '\n' || ch == EOF) |
662 | break; |
663 | } |
664 | |
665 | ADDC (ch); |
666 | if (return_widestr) |
667 | ADDWC ((uint32_t) ch); |
668 | |
669 | continue; |
670 | } |
671 | |
672 | /* Now we have to search for the end of the symbolic name, i.e., |
673 | the closing '>'. */ |
674 | startidx = bufact; |
675 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) |
676 | { |
677 | if (ch == lr->escape_char) |
678 | { |
679 | ch = lr_getc (lr); |
680 | if (ch == '\n' || ch == EOF) |
681 | break; |
682 | } |
683 | ADDC (ch); |
684 | } |
685 | if (ch == '\n' || ch == EOF) |
686 | /* Not a correct string. */ |
687 | break; |
688 | if (bufact == startidx) |
689 | { |
690 | /* <> is no correct name. Ignore it and also signal an |
691 | error. */ |
692 | illegal_string = 1; |
693 | continue; |
694 | } |
695 | |
696 | /* It might be a Uxxxx symbol. */ |
697 | if (buf[startidx] == 'U' |
698 | && (bufact - startidx == 5 || bufact - startidx == 9)) |
699 | { |
700 | char *cp = buf + startidx + 1; |
701 | while (cp < &buf[bufact] && isxdigit (*cp)) |
702 | ++cp; |
703 | |
704 | if (cp == &buf[bufact]) |
705 | { |
706 | char utmp[10]; |
707 | |
708 | /* Yes, it is. */ |
709 | ADDC ('\0'); |
710 | wch = strtoul (buf + startidx + 1, NULL, 16); |
711 | |
712 | /* Now forget about the name we just added. */ |
713 | bufact = startidx; |
714 | |
715 | if (return_widestr) |
716 | ADDWC (wch); |
717 | |
718 | /* See whether the charmap contains the Uxxxxxxxx names. */ |
719 | snprintf (utmp, sizeof (utmp), "U%08X" , wch); |
720 | seq = charmap_find_value (charmap, utmp, 9); |
721 | |
722 | if (seq == NULL) |
723 | { |
724 | /* No, this isn't the case. Now determine from |
725 | the repertoire the name of the character and |
726 | find it in the charmap. */ |
727 | if (repertoire != NULL) |
728 | { |
729 | const char *symbol; |
730 | |
731 | symbol = repertoire_find_symbol (repertoire, wch); |
732 | |
733 | if (symbol != NULL) |
734 | seq = charmap_find_value (charmap, symbol, |
735 | strlen (symbol)); |
736 | } |
737 | |
738 | if (seq == NULL) |
739 | { |
740 | #ifndef NO_TRANSLITERATION |
741 | /* Transliterate if possible. */ |
742 | if (locale != NULL) |
743 | { |
744 | uint32_t *translit; |
745 | |
746 | if ((locale->avail & CTYPE_LOCALE) == 0) |
747 | { |
748 | /* Load the CTYPE data now. */ |
749 | int old_needed = locale->needed; |
750 | |
751 | locale->needed = 0; |
752 | locale = load_locale (LC_CTYPE, |
753 | locale->name, |
754 | locale->repertoire_name, |
755 | charmap, locale); |
756 | locale->needed = old_needed; |
757 | } |
758 | |
759 | if ((locale->avail & CTYPE_LOCALE) != 0 |
760 | && ((translit = find_translit (locale, |
761 | charmap, wch)) |
762 | != NULL)) |
763 | /* The CTYPE data contains a matching |
764 | transliteration. */ |
765 | { |
766 | int i; |
767 | |
768 | for (i = 0; translit[i] != 0; ++i) |
769 | { |
770 | char utmp[10]; |
771 | |
772 | snprintf (utmp, sizeof (utmp), "U%08X" , |
773 | translit[i]); |
774 | seq = charmap_find_value (charmap, utmp, |
775 | 9); |
776 | assert (seq != NULL); |
777 | ADDS (seq->bytes, seq->nbytes); |
778 | } |
779 | |
780 | continue; |
781 | } |
782 | } |
783 | #endif /* NO_TRANSLITERATION */ |
784 | |
785 | /* Not a known name. */ |
786 | illegal_string = 1; |
787 | } |
788 | } |
789 | |
790 | if (seq != NULL) |
791 | ADDS (seq->bytes, seq->nbytes); |
792 | |
793 | continue; |
794 | } |
795 | } |
796 | |
797 | /* We now have the symbolic name in buf[startidx] to |
798 | buf[bufact-1]. Now find out the value for this character |
799 | in the charmap as well as in the repertoire map (in this |
800 | order). */ |
801 | seq = charmap_find_value (charmap, &buf[startidx], |
802 | bufact - startidx); |
803 | |
804 | if (seq == NULL) |
805 | { |
806 | /* This name is not in the charmap. */ |
807 | lr_error (lr, _("symbol `%.*s' not in charmap" ), |
808 | (int) (bufact - startidx), &buf[startidx]); |
809 | illegal_string = 1; |
810 | } |
811 | |
812 | if (return_widestr) |
813 | { |
814 | /* Now the same for the multibyte representation. */ |
815 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) |
816 | wch = seq->ucs4; |
817 | else |
818 | { |
819 | wch = repertoire_find_value (repertoire, &buf[startidx], |
820 | bufact - startidx); |
821 | if (seq != NULL) |
822 | seq->ucs4 = wch; |
823 | } |
824 | |
825 | if (wch == ILLEGAL_CHAR_VALUE) |
826 | { |
827 | /* This name is not in the repertoire map. */ |
828 | lr_error (lr, _("symbol `%.*s' not in repertoire map" ), |
829 | (int) (bufact - startidx), &buf[startidx]); |
830 | illegal_string = 1; |
831 | } |
832 | else |
833 | ADDWC (wch); |
834 | } |
835 | |
836 | /* Now forget about the name we just added. */ |
837 | bufact = startidx; |
838 | |
839 | /* And copy the bytes. */ |
840 | if (seq != NULL) |
841 | ADDS (seq->bytes, seq->nbytes); |
842 | } |
843 | |
844 | if (ch == '\n' || ch == EOF) |
845 | { |
846 | lr_error (lr, _("unterminated string" )); |
847 | illegal_string = 1; |
848 | } |
849 | |
850 | if (illegal_string) |
851 | { |
852 | free (buf); |
853 | free (buf2); |
854 | lr->token.val.str.startmb = NULL; |
855 | lr->token.val.str.lenmb = 0; |
856 | lr->token.val.str.startwc = NULL; |
857 | lr->token.val.str.lenwc = 0; |
858 | |
859 | return &lr->token; |
860 | } |
861 | |
862 | ADDC ('\0'); |
863 | |
864 | if (return_widestr) |
865 | { |
866 | ADDWC (0); |
867 | lr->token.val.str.startwc = xrealloc (buf2, |
868 | buf2act * sizeof (uint32_t)); |
869 | lr->token.val.str.lenwc = buf2act; |
870 | } |
871 | } |
872 | |
873 | lr->token.val.str.startmb = xrealloc (buf, bufact); |
874 | lr->token.val.str.lenmb = bufact; |
875 | |
876 | return &lr->token; |
877 | } |
878 | |