| 1 | /* Copyright (C) 1996-2022 Free Software Foundation, Inc. | 
| 2 |    This file is part of the GNU C Library. | 
| 3 |  | 
| 4 |    This program is free software; you can redistribute it and/or modify | 
| 5 |    it under the terms of the GNU General Public License as published | 
| 6 |    by the Free Software Foundation; version 2 of the License, or | 
| 7 |    (at your option) any later version. | 
| 8 |  | 
| 9 |    This program is distributed in the hope that it will be useful, | 
| 10 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 11 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 12 |    GNU General Public License for more details. | 
| 13 |  | 
| 14 |    You should have received a copy of the GNU General Public License | 
| 15 |    along with this program; if not, see <https://www.gnu.org/licenses/>.  */ | 
| 16 |  | 
| 17 | #ifdef HAVE_CONFIG_H | 
| 18 | # include <config.h> | 
| 19 | #endif | 
| 20 |  | 
| 21 | #include <assert.h> | 
| 22 | #include <ctype.h> | 
| 23 | #include <errno.h> | 
| 24 | #include <libintl.h> | 
| 25 | #include <stdarg.h> | 
| 26 | #include <stdlib.h> | 
| 27 | #include <string.h> | 
| 28 | #include <stdint.h> | 
| 29 |  | 
| 30 | #include "localedef.h" | 
| 31 | #include "charmap.h" | 
| 32 | #include "error.h" | 
| 33 | #include "linereader.h" | 
| 34 | #include "locfile.h" | 
| 35 |  | 
| 36 | /* Prototypes for local functions.  */ | 
| 37 | static struct token *get_toplvl_escape (struct linereader *lr); | 
| 38 | static struct token *get_symname (struct linereader *lr); | 
| 39 | static struct token *get_ident (struct linereader *lr); | 
| 40 | static struct token *get_string (struct linereader *lr, | 
| 41 | 				 const struct charmap_t *charmap, | 
| 42 | 				 struct localedef_t *locale, | 
| 43 | 				 const struct repertoire_t *repertoire, | 
| 44 | 				 int verbose); | 
| 45 |  | 
| 46 |  | 
| 47 | struct linereader * | 
| 48 | lr_open (const char *fname, kw_hash_fct_t hf) | 
| 49 | { | 
| 50 |   FILE *fp; | 
| 51 |  | 
| 52 |   if (fname == NULL || strcmp (fname, "-" ) == 0 | 
| 53 |       || strcmp (fname, "/dev/stdin" ) == 0) | 
| 54 |     return lr_create (stdin, "<stdin>" , hf); | 
| 55 |   else | 
| 56 |     { | 
| 57 |       fp = fopen (fname, "rm" ); | 
| 58 |       if (fp == NULL) | 
| 59 | 	return NULL; | 
| 60 |       return lr_create (fp, fname, hf); | 
| 61 |     } | 
| 62 | } | 
| 63 |  | 
| 64 | struct linereader * | 
| 65 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) | 
| 66 | { | 
| 67 |   struct linereader *result; | 
| 68 |   int n; | 
| 69 |  | 
| 70 |   result = (struct linereader *) xmalloc (sizeof (*result)); | 
| 71 |  | 
| 72 |   result->fp = fp; | 
| 73 |   result->fname = xstrdup (fname); | 
| 74 |   result->buf = NULL; | 
| 75 |   result->bufsize = 0; | 
| 76 |   result->lineno = 1; | 
| 77 |   result->idx = 0; | 
| 78 |   result->comment_char = '#'; | 
| 79 |   result->escape_char = '\\'; | 
| 80 |   result->translate_strings = 1; | 
| 81 |   result->return_widestr = 0; | 
| 82 |  | 
| 83 |   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); | 
| 84 |   if (n < 0) | 
| 85 |     { | 
| 86 |       int save = errno; | 
| 87 |       fclose (result->fp); | 
| 88 |       free ((char *) result->fname); | 
| 89 |       free (result); | 
| 90 |       errno = save; | 
| 91 |       return NULL; | 
| 92 |     } | 
| 93 |  | 
| 94 |   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') | 
| 95 |     n -= 2; | 
| 96 |  | 
| 97 |   result->buf[n] = '\0'; | 
| 98 |   result->bufact = n; | 
| 99 |   result->hash_fct = hf; | 
| 100 |  | 
| 101 |   return result; | 
| 102 | } | 
| 103 |  | 
| 104 |  | 
| 105 | int | 
| 106 | lr_eof (struct linereader *lr) | 
| 107 | { | 
| 108 |   return lr->bufact = 0; | 
| 109 | } | 
| 110 |  | 
| 111 |  | 
| 112 | void | 
| 113 | lr_ignore_rest (struct linereader *lr, int verbose) | 
| 114 | { | 
| 115 |   if (verbose) | 
| 116 |     { | 
| 117 |       while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' | 
| 118 | 	     && lr->buf[lr->idx] != lr->comment_char) | 
| 119 | 	if (lr->buf[lr->idx] == '\0') | 
| 120 | 	  { | 
| 121 | 	    if (lr_next (lr) < 0) | 
| 122 | 	      return; | 
| 123 | 	  } | 
| 124 | 	else | 
| 125 | 	  ++lr->idx; | 
| 126 |  | 
| 127 |       if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) | 
| 128 | 	  && lr->buf[lr->idx] != lr->comment_char) | 
| 129 | 	lr_error (lr, _("trailing garbage at end of line" )); | 
| 130 |     } | 
| 131 |  | 
| 132 |   /* Ignore continued line.  */ | 
| 133 |   while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') | 
| 134 |     if (lr_next (lr) < 0) | 
| 135 |       break; | 
| 136 |  | 
| 137 |   lr->idx = lr->bufact; | 
| 138 | } | 
| 139 |  | 
| 140 |  | 
| 141 | void | 
| 142 | lr_close (struct linereader *lr) | 
| 143 | { | 
| 144 |   fclose (lr->fp); | 
| 145 |   free (lr->buf); | 
| 146 |   free (lr); | 
| 147 | } | 
| 148 |  | 
| 149 |  | 
| 150 | int | 
| 151 | lr_next (struct linereader *lr) | 
| 152 | { | 
| 153 |   int n; | 
| 154 |  | 
| 155 |   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); | 
| 156 |   if (n < 0) | 
| 157 |     return -1; | 
| 158 |  | 
| 159 |   ++lr->lineno; | 
| 160 |  | 
| 161 |   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') | 
| 162 |     { | 
| 163 | #if 0 | 
| 164 |       /* XXX Is this correct?  */ | 
| 165 |       /* An escaped newline character is substituted with a single <SP>.  */ | 
| 166 |       --n; | 
| 167 |       lr->buf[n - 1] = ' '; | 
| 168 | #else | 
| 169 |       n -= 2; | 
| 170 | #endif | 
| 171 |     } | 
| 172 |  | 
| 173 |   lr->buf[n] = '\0'; | 
| 174 |   lr->bufact = n; | 
| 175 |   lr->idx = 0; | 
| 176 |  | 
| 177 |   return 0; | 
| 178 | } | 
| 179 |  | 
| 180 |  | 
| 181 | /* Defined in error.c.  */ | 
| 182 | /* This variable is incremented each time `error' is called.  */ | 
| 183 | extern unsigned int error_message_count; | 
| 184 |  | 
| 185 | /* The calling program should define program_name and set it to the | 
| 186 |    name of the executing program.  */ | 
| 187 | extern char *program_name; | 
| 188 |  | 
| 189 |  | 
| 190 | struct token * | 
| 191 | lr_token (struct linereader *lr, const struct charmap_t *charmap, | 
| 192 | 	  struct localedef_t *locale, const struct repertoire_t *repertoire, | 
| 193 | 	  int verbose) | 
| 194 | { | 
| 195 |   int ch; | 
| 196 |  | 
| 197 |   while (1) | 
| 198 |     { | 
| 199 |       do | 
| 200 | 	{ | 
| 201 | 	  ch = lr_getc (lr); | 
| 202 |  | 
| 203 | 	  if (ch == EOF) | 
| 204 | 	    { | 
| 205 | 	      lr->token.tok = tok_eof; | 
| 206 | 	      return &lr->token; | 
| 207 | 	    }; | 
| 208 |  | 
| 209 | 	  if (ch == '\n') | 
| 210 | 	    { | 
| 211 | 	      lr->token.tok = tok_eol; | 
| 212 | 	      return &lr->token; | 
| 213 | 	    } | 
| 214 | 	} | 
| 215 |       while (isspace (ch)); | 
| 216 |  | 
| 217 |       if (ch != lr->comment_char) | 
| 218 | 	break; | 
| 219 |  | 
| 220 |       /* Is there an newline at the end of the buffer?  */ | 
| 221 |       if (lr->buf[lr->bufact - 1] != '\n') | 
| 222 | 	{ | 
| 223 | 	  /* No.  Some people want this to mean that only the line in | 
| 224 | 	     the file not the logical, concatenated line is ignored. | 
| 225 | 	     Let's try this.  */ | 
| 226 | 	  lr->idx = lr->bufact; | 
| 227 | 	  continue; | 
| 228 | 	} | 
| 229 |  | 
| 230 |       /* Ignore rest of line.  */ | 
| 231 |       lr_ignore_rest (lr, 0); | 
| 232 |       lr->token.tok = tok_eol; | 
| 233 |       return &lr->token; | 
| 234 |     } | 
| 235 |  | 
| 236 |   /* Match escape sequences.  */ | 
| 237 |   if (ch == lr->escape_char) | 
| 238 |     return get_toplvl_escape (lr); | 
| 239 |  | 
| 240 |   /* Match ellipsis.  */ | 
| 241 |   if (ch == '.') | 
| 242 |     { | 
| 243 |       if (strncmp (&lr->buf[lr->idx], "...(2)...." , 10) == 0) | 
| 244 | 	{ | 
| 245 | 	  int cnt; | 
| 246 | 	  for (cnt = 0; cnt < 10; ++cnt) | 
| 247 | 	    lr_getc (lr); | 
| 248 | 	  lr->token.tok = tok_ellipsis4_2; | 
| 249 | 	  return &lr->token; | 
| 250 | 	} | 
| 251 |       if (strncmp (&lr->buf[lr->idx], "..." , 3) == 0) | 
| 252 | 	{ | 
| 253 | 	  lr_getc (lr); | 
| 254 | 	  lr_getc (lr); | 
| 255 | 	  lr_getc (lr); | 
| 256 | 	  lr->token.tok = tok_ellipsis4; | 
| 257 | 	  return &lr->token; | 
| 258 | 	} | 
| 259 |       if (strncmp (&lr->buf[lr->idx], ".." , 2) == 0) | 
| 260 | 	{ | 
| 261 | 	  lr_getc (lr); | 
| 262 | 	  lr_getc (lr); | 
| 263 | 	  lr->token.tok = tok_ellipsis3; | 
| 264 | 	  return &lr->token; | 
| 265 | 	} | 
| 266 |       if (strncmp (&lr->buf[lr->idx], ".(2).." , 6) == 0) | 
| 267 | 	{ | 
| 268 | 	  int cnt; | 
| 269 | 	  for (cnt = 0; cnt < 6; ++cnt) | 
| 270 | 	    lr_getc (lr); | 
| 271 | 	  lr->token.tok = tok_ellipsis2_2; | 
| 272 | 	  return &lr->token; | 
| 273 | 	} | 
| 274 |       if (lr->buf[lr->idx] == '.') | 
| 275 | 	{ | 
| 276 | 	  lr_getc (lr); | 
| 277 | 	  lr->token.tok = tok_ellipsis2; | 
| 278 | 	  return &lr->token; | 
| 279 | 	} | 
| 280 |     } | 
| 281 |  | 
| 282 |   switch (ch) | 
| 283 |     { | 
| 284 |     case '<': | 
| 285 |       return get_symname (lr); | 
| 286 |  | 
| 287 |     case '0' ... '9': | 
| 288 |       lr->token.tok = tok_number; | 
| 289 |       lr->token.val.num = ch - '0'; | 
| 290 |  | 
| 291 |       while (isdigit (ch = lr_getc (lr))) | 
| 292 | 	{ | 
| 293 | 	  lr->token.val.num *= 10; | 
| 294 | 	  lr->token.val.num += ch - '0'; | 
| 295 | 	} | 
| 296 |       if (isalpha (ch)) | 
| 297 | 	lr_error (lr, _("garbage at end of number" )); | 
| 298 |       lr_ungetn (lr, 1); | 
| 299 |  | 
| 300 |       return &lr->token; | 
| 301 |  | 
| 302 |     case ';': | 
| 303 |       lr->token.tok = tok_semicolon; | 
| 304 |       return &lr->token; | 
| 305 |  | 
| 306 |     case ',': | 
| 307 |       lr->token.tok = tok_comma; | 
| 308 |       return &lr->token; | 
| 309 |  | 
| 310 |     case '(': | 
| 311 |       lr->token.tok = tok_open_brace; | 
| 312 |       return &lr->token; | 
| 313 |  | 
| 314 |     case ')': | 
| 315 |       lr->token.tok = tok_close_brace; | 
| 316 |       return &lr->token; | 
| 317 |  | 
| 318 |     case '"': | 
| 319 |       return get_string (lr, charmap, locale, repertoire, verbose); | 
| 320 |  | 
| 321 |     case '-': | 
| 322 |       ch = lr_getc (lr); | 
| 323 |       if (ch == '1') | 
| 324 | 	{ | 
| 325 | 	  lr->token.tok = tok_minus1; | 
| 326 | 	  return &lr->token; | 
| 327 | 	} | 
| 328 |       lr_ungetn (lr, 2); | 
| 329 |       break; | 
| 330 |     } | 
| 331 |  | 
| 332 |   return get_ident (lr); | 
| 333 | } | 
| 334 |  | 
| 335 |  | 
| 336 | static struct token * | 
| 337 | get_toplvl_escape (struct linereader *lr) | 
| 338 | { | 
| 339 |   /* This is supposed to be a numeric value.  We return the | 
| 340 |      numerical value and the number of bytes.  */ | 
| 341 |   size_t start_idx = lr->idx - 1; | 
| 342 |   unsigned char *bytes = lr->token.val.charcode.bytes; | 
| 343 |   size_t nbytes = 0; | 
| 344 |   int ch; | 
| 345 |  | 
| 346 |   do | 
| 347 |     { | 
| 348 |       unsigned int byte = 0; | 
| 349 |       unsigned int base = 8; | 
| 350 |  | 
| 351 |       ch = lr_getc (lr); | 
| 352 |  | 
| 353 |       if (ch == 'd') | 
| 354 | 	{ | 
| 355 | 	  base = 10; | 
| 356 | 	  ch = lr_getc (lr); | 
| 357 | 	} | 
| 358 |       else if (ch == 'x') | 
| 359 | 	{ | 
| 360 | 	  base = 16; | 
| 361 | 	  ch = lr_getc (lr); | 
| 362 | 	} | 
| 363 |  | 
| 364 |       if ((base == 16 && !isxdigit (ch)) | 
| 365 | 	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) | 
| 366 | 	{ | 
| 367 | 	esc_error: | 
| 368 | 	  lr->token.val.str.startmb = &lr->buf[start_idx]; | 
| 369 |  | 
| 370 | 	  while (ch != EOF && !isspace (ch)) | 
| 371 | 	    ch = lr_getc (lr); | 
| 372 | 	  lr->token.val.str.lenmb = lr->idx - start_idx; | 
| 373 |  | 
| 374 | 	  lr->token.tok = tok_error; | 
| 375 | 	  return &lr->token; | 
| 376 | 	} | 
| 377 |  | 
| 378 |       if (isdigit (ch)) | 
| 379 | 	byte = ch - '0'; | 
| 380 |       else | 
| 381 | 	byte = tolower (ch) - 'a' + 10; | 
| 382 |  | 
| 383 |       ch = lr_getc (lr); | 
| 384 |       if ((base == 16 && !isxdigit (ch)) | 
| 385 | 	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) | 
| 386 | 	goto esc_error; | 
| 387 |  | 
| 388 |       byte *= base; | 
| 389 |       if (isdigit (ch)) | 
| 390 | 	byte += ch - '0'; | 
| 391 |       else | 
| 392 | 	byte += tolower (ch) - 'a' + 10; | 
| 393 |  | 
| 394 |       ch = lr_getc (lr); | 
| 395 |       if (base != 16 && isdigit (ch)) | 
| 396 | 	{ | 
| 397 | 	  byte *= base; | 
| 398 | 	  byte += ch - '0'; | 
| 399 |  | 
| 400 | 	  ch = lr_getc (lr); | 
| 401 | 	} | 
| 402 |  | 
| 403 |       bytes[nbytes++] = byte; | 
| 404 |     } | 
| 405 |   while (ch == lr->escape_char | 
| 406 | 	 && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); | 
| 407 |  | 
| 408 |   if (!isspace (ch)) | 
| 409 |     lr_error (lr, _("garbage at end of character code specification" )); | 
| 410 |  | 
| 411 |   lr_ungetn (lr, 1); | 
| 412 |  | 
| 413 |   lr->token.tok = tok_charcode; | 
| 414 |   lr->token.val.charcode.nbytes = nbytes; | 
| 415 |  | 
| 416 |   return &lr->token; | 
| 417 | } | 
| 418 |  | 
| 419 |  | 
/* Append the single byte CH to the growing buffer described by the
   caller's local variables buf (storage), bufact (bytes in use) and
   bufmax (capacity).  The capacity is doubled when the buffer is full.  */
#define ADDC(ch) \
  do {									      \
    if (bufact == bufmax)						      \
      {									      \
	bufmax += bufmax;						      \
	buf = xrealloc (buf, bufmax);					      \
      }									      \
    buf[bufact++] = (ch);						      \
  } while (0)
| 431 |  | 
| 432 |  | 
/* Append the L bytes starting at S to the growing buffer described by
   the caller's local variables buf (storage), bufact (bytes in use) and
   bufmax (capacity).

   When the bytes do not fit, the capacity is first raised to at least L
   and then doubled.  Since bufact never exceeds the old capacity, the
   new capacity always has room for bufact + L bytes.  The write
   position bufact itself is only advanced by the copy.  */
#define ADDS(s, l) \
  do									      \
    {									      \
      size_t _l = (l);							      \
      if (bufact + _l > bufmax)						      \
	{								      \
	  if (bufmax < _l)						      \
	    bufmax = _l;						      \
	  bufmax *= 2;							      \
	  buf = xrealloc (buf, bufmax);					      \
	}								      \
      memcpy (&buf[bufact], (s), _l);					      \
      bufact += _l;							      \
    }									      \
  while (0)
| 448 |  | 
| 449 |  | 
/* Append the wide character CH to the growing wide buffer described by
   the caller's local variables buf2 (storage), buf2act (elements in
   use) and buf2max (capacity).  Both counters are in ELEMENTS, not
   bytes; the byte size passed to xrealloc is derived from the element
   type of buf2.  The capacity is doubled when the buffer is full.  */
#define ADDWC(ch) \
  do									      \
    {									      \
      if (buf2act == buf2max)						      \
	{								      \
	  buf2max *= 2;							      \
	  buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));		      \
	}								      \
      buf2[buf2act++] = (ch);						      \
    }									      \
  while (0)
| 461 |  | 
| 462 |  | 
| 463 | static struct token * | 
| 464 | get_symname (struct linereader *lr) | 
| 465 | { | 
| 466 |   /* Symbol in brackets.  We must distinguish three kinds: | 
| 467 |      1. reserved words | 
| 468 |      2. ISO 10646 position values | 
| 469 |      3. all other.  */ | 
| 470 |   char *buf; | 
| 471 |   size_t bufact = 0; | 
| 472 |   size_t bufmax = 56; | 
| 473 |   const struct keyword_t *kw; | 
| 474 |   int ch; | 
| 475 |  | 
| 476 |   buf = (char *) xmalloc (bufmax); | 
| 477 |  | 
| 478 |   do | 
| 479 |     { | 
| 480 |       ch = lr_getc (lr); | 
| 481 |       if (ch == lr->escape_char) | 
| 482 | 	{ | 
| 483 | 	  int c2 = lr_getc (lr); | 
| 484 | 	  ADDC (c2); | 
| 485 |  | 
| 486 | 	  if (c2 == '\n') | 
| 487 | 	    ch = '\n'; | 
| 488 | 	} | 
| 489 |       else | 
| 490 | 	ADDC (ch); | 
| 491 |     } | 
| 492 |   while (ch != '>' && ch != '\n'); | 
| 493 |  | 
| 494 |   if (ch == '\n') | 
| 495 |     lr_error (lr, _("unterminated symbolic name" )); | 
| 496 |  | 
| 497 |   /* Test for ISO 10646 position value.  */ | 
| 498 |   if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) | 
| 499 |     { | 
| 500 |       char *cp = buf + 1; | 
| 501 |       while (cp < &buf[bufact - 1] && isxdigit (*cp)) | 
| 502 | 	++cp; | 
| 503 |  | 
| 504 |       if (cp == &buf[bufact - 1]) | 
| 505 | 	{ | 
| 506 | 	  /* Yes, it is.  */ | 
| 507 | 	  lr->token.tok = tok_ucs4; | 
| 508 | 	  lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); | 
| 509 |  | 
| 510 | 	  return &lr->token; | 
| 511 | 	} | 
| 512 |     } | 
| 513 |  | 
| 514 |   /* It is a symbolic name.  Test for reserved words.  */ | 
| 515 |   kw = lr->hash_fct (buf, bufact - 1); | 
| 516 |  | 
| 517 |   if (kw != NULL && kw->symname_or_ident == 1) | 
| 518 |     { | 
| 519 |       lr->token.tok = kw->token; | 
| 520 |       free (buf); | 
| 521 |     } | 
| 522 |   else | 
| 523 |     { | 
| 524 |       lr->token.tok = tok_bsymbol; | 
| 525 |  | 
| 526 |       buf = xrealloc (buf, bufact + 1); | 
| 527 |       buf[bufact] = '\0'; | 
| 528 |  | 
| 529 |       lr->token.val.str.startmb = buf; | 
| 530 |       lr->token.val.str.lenmb = bufact - 1; | 
| 531 |     } | 
| 532 |  | 
| 533 |   return &lr->token; | 
| 534 | } | 
| 535 |  | 
| 536 |  | 
| 537 | static struct token * | 
| 538 | get_ident (struct linereader *lr) | 
| 539 | { | 
| 540 |   char *buf; | 
| 541 |   size_t bufact; | 
| 542 |   size_t bufmax = 56; | 
| 543 |   const struct keyword_t *kw; | 
| 544 |   int ch; | 
| 545 |  | 
| 546 |   buf = xmalloc (bufmax); | 
| 547 |   bufact = 0; | 
| 548 |  | 
| 549 |   ADDC (lr->buf[lr->idx - 1]); | 
| 550 |  | 
| 551 |   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' | 
| 552 | 	 && ch != '<' && ch != ',' && ch != EOF) | 
| 553 |     { | 
| 554 |       if (ch == lr->escape_char) | 
| 555 | 	{ | 
| 556 | 	  ch = lr_getc (lr); | 
| 557 | 	  if (ch == '\n' || ch == EOF) | 
| 558 | 	    { | 
| 559 | 	      lr_error (lr, _("invalid escape sequence" )); | 
| 560 | 	      break; | 
| 561 | 	    } | 
| 562 | 	} | 
| 563 |       ADDC (ch); | 
| 564 |     } | 
| 565 |  | 
| 566 |   lr_ungetc (lr, ch); | 
| 567 |  | 
| 568 |   kw = lr->hash_fct (buf, bufact); | 
| 569 |  | 
| 570 |   if (kw != NULL && kw->symname_or_ident == 0) | 
| 571 |     { | 
| 572 |       lr->token.tok = kw->token; | 
| 573 |       free (buf); | 
| 574 |     } | 
| 575 |   else | 
| 576 |     { | 
| 577 |       lr->token.tok = tok_ident; | 
| 578 |  | 
| 579 |       buf = xrealloc (buf, bufact + 1); | 
| 580 |       buf[bufact] = '\0'; | 
| 581 |  | 
| 582 |       lr->token.val.str.startmb = buf; | 
| 583 |       lr->token.val.str.lenmb = bufact; | 
| 584 |     } | 
| 585 |  | 
| 586 |   return &lr->token; | 
| 587 | } | 
| 588 |  | 
| 589 |  | 
| 590 | static struct token * | 
| 591 | get_string (struct linereader *lr, const struct charmap_t *charmap, | 
| 592 | 	    struct localedef_t *locale, const struct repertoire_t *repertoire, | 
| 593 | 	    int verbose) | 
| 594 | { | 
| 595 |   int return_widestr = lr->return_widestr; | 
| 596 |   char *buf; | 
| 597 |   wchar_t *buf2 = NULL; | 
| 598 |   size_t bufact; | 
| 599 |   size_t bufmax = 56; | 
| 600 |  | 
| 601 |   /* We must return two different strings.  */ | 
| 602 |   buf = xmalloc (bufmax); | 
| 603 |   bufact = 0; | 
| 604 |  | 
| 605 |   /* We know it'll be a string.  */ | 
| 606 |   lr->token.tok = tok_string; | 
| 607 |  | 
| 608 |   /* If we need not translate the strings (i.e., expand <...> parts) | 
| 609 |      we can run a simple loop.  */ | 
| 610 |   if (!lr->translate_strings) | 
| 611 |     { | 
| 612 |       int ch; | 
| 613 |  | 
| 614 |       buf2 = NULL; | 
| 615 |       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | 
| 616 | 	ADDC (ch); | 
| 617 |  | 
| 618 |       /* Catch errors with trailing escape character.  */ | 
| 619 |       if (bufact > 0 && buf[bufact - 1] == lr->escape_char | 
| 620 | 	  && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) | 
| 621 | 	{ | 
| 622 | 	  lr_error (lr, _("illegal escape sequence at end of string" )); | 
| 623 | 	  --bufact; | 
| 624 | 	} | 
| 625 |       else if (ch == '\n' || ch == EOF) | 
| 626 | 	lr_error (lr, _("unterminated string" )); | 
| 627 |  | 
| 628 |       ADDC ('\0'); | 
| 629 |     } | 
| 630 |   else | 
| 631 |     { | 
| 632 |       int illegal_string = 0; | 
| 633 |       size_t buf2act = 0; | 
| 634 |       size_t buf2max = 56 * sizeof (uint32_t); | 
| 635 |       int ch; | 
| 636 |  | 
| 637 |       /* We have to provide the wide character result as well.  */ | 
| 638 |       if (return_widestr) | 
| 639 | 	buf2 = xmalloc (buf2max); | 
| 640 |  | 
| 641 |       /* Read until the end of the string (or end of the line or file).  */ | 
| 642 |       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | 
| 643 | 	{ | 
| 644 | 	  size_t startidx; | 
| 645 | 	  uint32_t wch; | 
| 646 | 	  struct charseq *seq; | 
| 647 |  | 
| 648 | 	  if (ch != '<') | 
| 649 | 	    { | 
| 650 | 	      /* The standards leave it up to the implementation to decide | 
| 651 | 		 what to do with character which stand for themself.  We | 
| 652 | 		 could jump through hoops to find out the value relative to | 
| 653 | 		 the charmap and the repertoire map, but instead we leave | 
| 654 | 		 it up to the locale definition author to write a better | 
| 655 | 		 definition.  We assume here that every character which | 
| 656 | 		 stands for itself is encoded using ISO 8859-1.  Using the | 
| 657 | 		 escape character is allowed.  */ | 
| 658 | 	      if (ch == lr->escape_char) | 
| 659 | 		{ | 
| 660 | 		  ch = lr_getc (lr); | 
| 661 | 		  if (ch == '\n' || ch == EOF) | 
| 662 | 		    break; | 
| 663 | 		} | 
| 664 |  | 
| 665 | 	      ADDC (ch); | 
| 666 | 	      if (return_widestr) | 
| 667 | 		ADDWC ((uint32_t) ch); | 
| 668 |  | 
| 669 | 	      continue; | 
| 670 | 	    } | 
| 671 |  | 
| 672 | 	  /* Now we have to search for the end of the symbolic name, i.e., | 
| 673 | 	     the closing '>'.  */ | 
| 674 | 	  startidx = bufact; | 
| 675 | 	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) | 
| 676 | 	    { | 
| 677 | 	      if (ch == lr->escape_char) | 
| 678 | 		{ | 
| 679 | 		  ch = lr_getc (lr); | 
| 680 | 		  if (ch == '\n' || ch == EOF) | 
| 681 | 		    break; | 
| 682 | 		} | 
| 683 | 	      ADDC (ch); | 
| 684 | 	    } | 
| 685 | 	  if (ch == '\n' || ch == EOF) | 
| 686 | 	    /* Not a correct string.  */ | 
| 687 | 	    break; | 
| 688 | 	  if (bufact == startidx) | 
| 689 | 	    { | 
| 690 | 	      /* <> is no correct name.  Ignore it and also signal an | 
| 691 | 		 error.  */ | 
| 692 | 	      illegal_string = 1; | 
| 693 | 	      continue; | 
| 694 | 	    } | 
| 695 |  | 
| 696 | 	  /* It might be a Uxxxx symbol.  */ | 
| 697 | 	  if (buf[startidx] == 'U' | 
| 698 | 	      && (bufact - startidx == 5 || bufact - startidx == 9)) | 
| 699 | 	    { | 
| 700 | 	      char *cp = buf + startidx + 1; | 
| 701 | 	      while (cp < &buf[bufact] && isxdigit (*cp)) | 
| 702 | 		++cp; | 
| 703 |  | 
| 704 | 	      if (cp == &buf[bufact]) | 
| 705 | 		{ | 
| 706 | 		  char utmp[10]; | 
| 707 |  | 
| 708 | 		  /* Yes, it is.  */ | 
| 709 | 		  ADDC ('\0'); | 
| 710 | 		  wch = strtoul (buf + startidx + 1, NULL, 16); | 
| 711 |  | 
| 712 | 		  /* Now forget about the name we just added.  */ | 
| 713 | 		  bufact = startidx; | 
| 714 |  | 
| 715 | 		  if (return_widestr) | 
| 716 | 		    ADDWC (wch); | 
| 717 |  | 
| 718 | 		  /* See whether the charmap contains the Uxxxxxxxx names.  */ | 
| 719 | 		  snprintf (utmp, sizeof (utmp), "U%08X" , wch); | 
| 720 | 		  seq = charmap_find_value (charmap, utmp, 9); | 
| 721 |  | 
| 722 | 		  if (seq == NULL) | 
| 723 | 		    { | 
| 724 | 		     /* No, this isn't the case.  Now determine from | 
| 725 | 			the repertoire the name of the character and | 
| 726 | 			find it in the charmap.  */ | 
| 727 | 		      if (repertoire != NULL) | 
| 728 | 			{ | 
| 729 | 			  const char *symbol; | 
| 730 |  | 
| 731 | 			  symbol = repertoire_find_symbol (repertoire, wch); | 
| 732 |  | 
| 733 | 			  if (symbol != NULL) | 
| 734 | 			    seq = charmap_find_value (charmap, symbol, | 
| 735 | 						      strlen (symbol)); | 
| 736 | 			} | 
| 737 |  | 
| 738 | 		      if (seq == NULL) | 
| 739 | 			{ | 
| 740 | #ifndef NO_TRANSLITERATION | 
| 741 | 			  /* Transliterate if possible.  */ | 
| 742 | 			  if (locale != NULL) | 
| 743 | 			    { | 
| 744 | 			      uint32_t *translit; | 
| 745 |  | 
| 746 | 			      if ((locale->avail & CTYPE_LOCALE) == 0) | 
| 747 | 				{ | 
| 748 | 				  /* Load the CTYPE data now.  */ | 
| 749 | 				  int old_needed = locale->needed; | 
| 750 |  | 
| 751 | 				  locale->needed = 0; | 
| 752 | 				  locale = load_locale (LC_CTYPE, | 
| 753 | 							locale->name, | 
| 754 | 							locale->repertoire_name, | 
| 755 | 							charmap, locale); | 
| 756 | 				  locale->needed = old_needed; | 
| 757 | 				} | 
| 758 |  | 
| 759 | 			      if ((locale->avail & CTYPE_LOCALE) != 0 | 
| 760 | 				  && ((translit = find_translit (locale, | 
| 761 | 								 charmap, wch)) | 
| 762 | 				      != NULL)) | 
| 763 | 				/* The CTYPE data contains a matching | 
| 764 | 				   transliteration.  */ | 
| 765 | 				{ | 
| 766 | 				  int i; | 
| 767 |  | 
| 768 | 				  for (i = 0; translit[i] != 0; ++i) | 
| 769 | 				    { | 
| 770 | 				      char utmp[10]; | 
| 771 |  | 
| 772 | 				      snprintf (utmp, sizeof (utmp), "U%08X" , | 
| 773 | 						translit[i]); | 
| 774 | 				      seq = charmap_find_value (charmap, utmp, | 
| 775 | 								9); | 
| 776 | 				      assert (seq != NULL); | 
| 777 | 				      ADDS (seq->bytes, seq->nbytes); | 
| 778 | 				    } | 
| 779 |  | 
| 780 | 				  continue; | 
| 781 | 				} | 
| 782 | 			    } | 
| 783 | #endif	/* NO_TRANSLITERATION */ | 
| 784 |  | 
| 785 | 			  /* Not a known name.  */ | 
| 786 | 			  illegal_string = 1; | 
| 787 | 			} | 
| 788 | 		    } | 
| 789 |  | 
| 790 | 		  if (seq != NULL) | 
| 791 | 		    ADDS (seq->bytes, seq->nbytes); | 
| 792 |  | 
| 793 | 		  continue; | 
| 794 | 		} | 
| 795 | 	    } | 
| 796 |  | 
| 797 | 	  /* We now have the symbolic name in buf[startidx] to | 
| 798 | 	     buf[bufact-1].  Now find out the value for this character | 
| 799 | 	     in the charmap as well as in the repertoire map (in this | 
| 800 | 	     order).  */ | 
| 801 | 	  seq = charmap_find_value (charmap, &buf[startidx], | 
| 802 | 				    bufact - startidx); | 
| 803 |  | 
| 804 | 	  if (seq == NULL) | 
| 805 | 	    { | 
| 806 | 	      /* This name is not in the charmap.  */ | 
| 807 | 	      lr_error (lr, _("symbol `%.*s' not in charmap" ), | 
| 808 | 			(int) (bufact - startidx), &buf[startidx]); | 
| 809 | 	      illegal_string = 1; | 
| 810 | 	    } | 
| 811 |  | 
| 812 | 	  if (return_widestr) | 
| 813 | 	    { | 
| 814 | 	      /* Now the same for the multibyte representation.  */ | 
| 815 | 	      if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) | 
| 816 | 		wch = seq->ucs4; | 
| 817 | 	      else | 
| 818 | 		{ | 
| 819 | 		  wch = repertoire_find_value (repertoire, &buf[startidx], | 
| 820 | 					       bufact - startidx); | 
| 821 | 		  if (seq != NULL) | 
| 822 | 		    seq->ucs4 = wch; | 
| 823 | 		} | 
| 824 |  | 
| 825 | 	      if (wch == ILLEGAL_CHAR_VALUE) | 
| 826 | 		{ | 
| 827 | 		  /* This name is not in the repertoire map.  */ | 
| 828 | 		  lr_error (lr, _("symbol `%.*s' not in repertoire map" ), | 
| 829 | 			    (int) (bufact - startidx), &buf[startidx]); | 
| 830 | 		  illegal_string = 1; | 
| 831 | 		} | 
| 832 | 	      else | 
| 833 | 		ADDWC (wch); | 
| 834 | 	    } | 
| 835 |  | 
| 836 | 	  /* Now forget about the name we just added.  */ | 
| 837 | 	  bufact = startidx; | 
| 838 |  | 
| 839 | 	  /* And copy the bytes.  */ | 
| 840 | 	  if (seq != NULL) | 
| 841 | 	    ADDS (seq->bytes, seq->nbytes); | 
| 842 | 	} | 
| 843 |  | 
| 844 |       if (ch == '\n' || ch == EOF) | 
| 845 | 	{ | 
| 846 | 	  lr_error (lr, _("unterminated string" )); | 
| 847 | 	  illegal_string = 1; | 
| 848 | 	} | 
| 849 |  | 
| 850 |       if (illegal_string) | 
| 851 | 	{ | 
| 852 | 	  free (buf); | 
| 853 | 	  free (buf2); | 
| 854 | 	  lr->token.val.str.startmb = NULL; | 
| 855 | 	  lr->token.val.str.lenmb = 0; | 
| 856 | 	  lr->token.val.str.startwc = NULL; | 
| 857 | 	  lr->token.val.str.lenwc = 0; | 
| 858 |  | 
| 859 | 	  return &lr->token; | 
| 860 | 	} | 
| 861 |  | 
| 862 |       ADDC ('\0'); | 
| 863 |  | 
| 864 |       if (return_widestr) | 
| 865 | 	{ | 
| 866 | 	  ADDWC (0); | 
| 867 | 	  lr->token.val.str.startwc = xrealloc (buf2, | 
| 868 | 						buf2act * sizeof (uint32_t)); | 
| 869 | 	  lr->token.val.str.lenwc = buf2act; | 
| 870 | 	} | 
| 871 |     } | 
| 872 |  | 
| 873 |   lr->token.val.str.startmb = xrealloc (buf, bufact); | 
| 874 |   lr->token.val.str.lenmb = bufact; | 
| 875 |  | 
| 876 |   return &lr->token; | 
| 877 | } | 
| 878 |  |