charmap.c source code [glibc/locale/programs/charmap.c]

/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16
17	#ifdef HAVE_CONFIG_H
18	# include <config.h>
19	#endif
20
21	#include <ctype.h>
22	#include <errno.h>
23	#include <libintl.h>
24	#include <limits.h>
25	#include <stdio.h>
26	#include <stdlib.h>
27	#include <string.h>
28	#include <stdint.h>
29
30	#include "localedef.h"
31	#include "linereader.h"
32	#include "charmap.h"
33	#include "charmap-dir.h"
34
35	#include <assert.h>
36
37
/* Define the lookup function.  */
39	#include "charmap-kw.h"
40
41
/* Prototypes for local functions.  The definitions follow further down
   in this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the ASCII compatibility test of the
   loaded character map fails.  It is never reset in this file.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
60
61	static struct linereader *
62	cmlr_open (const char directory, const* char *name, kw_hash_fct_t hf)
63	{
64	FILE *fp;
65
66	fp = charmap_open (directory, name);
67	if (fp == NULL)
68	return NULL;
69	else
70	{
71	size_t dlen = strlen (directory);
72	int add_slash = (dlen == `0` \|\| directory[dlen - `1`] != `'/'`);
73	size_t nlen = strlen (name);
74	char *pathname;
75	char *p;
76
77	pathname = alloca (dlen + add_slash + nlen + `1`);
78	p = stpcpy (pathname, directory);
79	if (add_slash)
80	*p++ = `'/'`;
81	stpcpy (p, name);
82
83	return lr_create (fp, pathname, hf);
84	}
85	}
86
87	struct charmap_t *
88	charmap_read (const char filename, int* verbose, int error_not_found,
89	int be_quiet, int use_default)
90	{
91	struct charmap_t *result = NULL;
92
93	if (filename != NULL)
94	{
95	struct linereader *cmfile;
96
97	/ First try the name as found in the parameter. /
98	cmfile = lr_open (filename, charmap_hash);
99	if (cmfile == NULL)
100	{
101	/ No successful. So start looking through the directories*
102	in the I18NPATH if this is a simple name. /*
103	if (strchr (filename, `'/'`) == NULL)
104	{
105	char *i18npath = getenv ("I18NPATH");
106	if (i18npath != NULL && *i18npath != `'\0'`)
107	{
108	const size_t pathlen = strlen (i18npath);
109	char i18npathbuf[pathlen + `1`];
110	char path[pathlen + sizeof ("/charmaps")];
111	char *next;
112	i18npath = memcpy (i18npathbuf, i18npath, pathlen + `1`);
113
114	while (cmfile == NULL
115	&& (next = strsep (&i18npath, ":")) != NULL)
116	{
117	stpcpy (stpcpy (path, next), "/charmaps");
118	cmfile = cmlr_open (path, filename, charmap_hash);
119
120	if (cmfile == NULL)
121	/ Try without the "/charmaps" part. /
122	cmfile = cmlr_open (next, filename, charmap_hash);
123	}
124	}
125
126	if (cmfile == NULL)
127	/ Try the default directory. /
128	cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
129	}
130	}
131
132	if (cmfile != NULL)
133	result = parse_charmap (cmfile, verbose, be_quiet);
134
135	if (result == NULL && error_not_found)
136	record_error (`0`, errno,
137	_("character map file `%s' not found"),
138	filename);
139	}
140
141	if (result == NULL && filename != NULL && strchr (filename, `'/'`) == NULL)
142	{
143	/ OK, one more try. We also accept the names given to the*
144	character sets in the files. Sometimes they differ from the
145	file name. /*
146	CHARMAP_DIR *dir;
147
148	dir = charmap_opendir (CHARMAP_PATH);
149	if (dir != NULL)
150	{
151	const char *dirent;
152
153	while ((dirent = charmap_readdir (dir)) != NULL)
154	{
155	char **aliases;
156	char **p;
157	int found;
158
159	aliases = charmap_aliases (CHARMAP_PATH, dirent);
160	found = `0`;
161	for (p = aliases; *p; p++)
162	if (strcasecmp (*p, filename) == `0`)
163	{
164	found = `1`;
165	break;
166	}
167	charmap_free_aliases (aliases);
168
169	if (found)
170	{
171	struct linereader *cmfile;
172
173	cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
174	if (cmfile != NULL)
175	result = parse_charmap (cmfile, verbose, be_quiet);
176
177	break;
178	}
179	}
180
181	charmap_closedir (dir);
182	}
183	}
184
185	if (result == NULL && DEFAULT_CHARMAP != NULL)
186	{
187	struct linereader *cmfile;
188
189	cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
190	if (cmfile != NULL)
191	result = parse_charmap (cmfile, verbose, be_quiet);
192
193	if (result == NULL)
194	record_error (`4`, errno,
195	_("default character map file `%s' not found"),
196	DEFAULT_CHARMAP);
197	}
198
199	if (result != NULL && result->code_set_name == NULL)
200	/ The input file does not specify a code set name. This*
201	shouldn't happen but we should cope with it. /*
202	result->code_set_name = basename (filename);
203
204	/ Test of ASCII compatibility of locale encoding.*
205
206	Verify that the encoding to be used in a locale is ASCII compatible,
207	at least for the graphic characters, excluding the control characters,
208	'$' and '@'. This constraint comes from an ISO C 99 restriction.
209
210	ISO C 99 section 7.17.(2) (about wchar_t):
211	the null character shall have the code value zero and each member of
212	the basic character set shall have a code value equal to its value
213	when used as the lone character in an integer character constant.
214	ISO C 99 section 5.2.1.(3):
215	Both the basic source and basic execution character sets shall have
216	the following members: the 26 uppercase letters of the Latin alphabet
217	A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
218	the 26 lowercase letters of the Latin alphabet
219	a b c d e f g h i j k l m n o p q r s t u v w x y z
220	the 10 decimal digits
221	0 1 2 3 4 5 6 7 8 9
222	the following 29 graphic characters
223	! " # % & ' ( ) + , - . / : ; < = > ? [ \ ] ^ _ { \| } ~*
224	the space character, and control characters representing horizontal
225	tab, vertical tab, and form feed.
226
227	Therefore, for all members of the "basic character set", the 'char' code
228	must have the same value as the 'wchar_t' code, which in glibc is the
229	same as the Unicode code, which for all of the enumerated characters
230	is identical to the ASCII code. /*
231	if (result != NULL && use_default)
232	{
233	static const char basic_charset[] =
234	{
235	`'A'`, `'B'`, `'C'`, `'D'`, `'E'`, `'F'`, `'G'`, `'H'`, `'I'`, `'J'`, `'K'`, `'L'`, `'M'`,
236	`'N'`, `'O'`, `'P'`, `'Q'`, `'R'`, `'S'`, `'T'`, `'U'`, `'V'`, `'W'`, `'X'`, `'Y'`, `'Z'`,
237	`'a'`, `'b'`, `'c'`, `'d'`, `'e'`, `'f'`, `'g'`, `'h'`, `'i'`, `'j'`, `'k'`, `'l'`, `'m'`,
238	`'n'`, `'o'`, `'p'`, `'q'`, `'r'`, `'s'`, `'t'`, `'u'`, `'v'`, `'w'`, `'x'`, `'y'`, `'z'`,
239	`'0'`, `'1'`, `'2'`, `'3'`, `'4'`, `'5'`, `'6'`, `'7'`, `'8'`, `'9'`,
240	`'!'`, `'"'`, `'#'`, `'%'`, `'&'`, `'\''`, `'('`, `')'`, `'*'`, `'+'`, `','`, `'-'`,
241	`'.'`, `'/'`, `':'`, `';'`, `'<'`, `'='`, `'>'`, `'?'`, `'['`, `'\\'`, `']'`, `'^'`,
242	`'_'`, `'{'`, `'\|'`, `'}'`, `'~'`, `' '`, `'\t'`, `'\v'`, `'\f'`, `'\0'`
243	};
244	int failed = `0`;
245	const char *p = basic_charset;
246
247	do
248	{
249	struct charseq *seq = charmap_find_symbol (result, p, `1`);
250
251	if (seq == NULL \|\| seq->ucs4 != (uint32_t) *p)
252	failed = `1`;
253	}
254	while (*p++ != `'\0'`);
255
256	if (failed)
257	{
258	/ A user may disable the ASCII compatibility warning check,*
259	but we must remember that the encoding is not ASCII
260	compatible, since it may have other implications. Later
261	we will set _NL_CTYPE_MAP_TO_NONASCII from this value. /*
262	if (warn_ascii)
263	record_warning (_(
264	"character map `%s' is not ASCII compatible, locale not ISO C compliant "
265	"[--no-warnings=ascii]"),
266	result->code_set_name);
267	enc_not_ascii_compatible = true;
268	}
269	}
270
271	return result;
272	}
273
274
275	static struct charmap_t *
276	parse_charmap (struct linereader cmfile, int* verbose, int be_quiet)
277	{
278	struct charmap_t *result;
279	int state;
280	enum token_t expected_tok = tok_error;
281	const char *expected_str = NULL;
282	char *from_name = NULL;
283	char *to_name = NULL;
284	enum token_t ellipsis = `0`;
285	int step = `1`;
286
287	/ We don't want symbolic names in string to be translated. /
288	cmfile->translate_strings = `0`;
289
290	/ Allocate room for result. /
291	result = (struct charmap_t ) xmalloc (sizeof* (struct charmap_t));
292	memset (result, `'\0'`, sizeof (struct charmap_t));
293	/ The default DEFAULT_WIDTH is 1. /
294	result->width_default = `1`;
295
296	#define obstack_chunk_alloc malloc
297	#define obstack_chunk_free free
298	obstack_init (&result->mem_pool);
299
300	if (init_hash (&result->char_table, `256`)
301	\|\| init_hash (&result->byte_table, `256`))
302	{
303	free (result);
304	return NULL;
305	}
306
307	/ We use a state machine to describe the charmap description file*
308	format. /*
309	state = `1`;
310	while (`1`)
311	{
312	/ What's on? /
313	struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
314	enum token_t nowtok = now->tok;
315	struct token *arg;
316
317	if (nowtok == tok_eof)
318	break;
319
320	switch (state)
321	{
322	case `1`:
323	/ The beginning. We expect the special declarations, EOL or*
324	`CHARMAP'. /*
325	if (nowtok == tok_eol)
326	/ Ignore empty lines. /
327	continue;
328
329	if (nowtok == tok_charmap)
330	{
331	from_name = NULL;
332	to_name = NULL;
333
334	/ We have to set up the real work. Fill in some*
335	default values. /*
336	if (result->mb_cur_max == `0`)
337	result->mb_cur_max = `1`;
338	if (result->mb_cur_min == `0`)
339	result->mb_cur_min = result->mb_cur_max;
340	if (result->mb_cur_min > result->mb_cur_max)
341	{
342	record_error (`0`, `0`, _("\
343	%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
344	cmfile->fname);
345
346	result->mb_cur_min = result->mb_cur_max;
347	}
348
349	lr_ignore_rest (cmfile, `1`);
350
351	state = `2`;
352	continue;
353	}
354
355	if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
356	&& nowtok != tok_mb_cur_min && nowtok != tok_escape_char
357	&& nowtok != tok_comment_char && nowtok != tok_g0esc
358	&& nowtok != tok_g1esc && nowtok != tok_g2esc
359	&& nowtok != tok_g3esc && nowtok != tok_repertoiremap
360	&& nowtok != tok_include)
361	{
362	lr_error (cmfile, _("syntax error in prolog: %s"),
363	_("invalid definition"));
364
365	lr_ignore_rest (cmfile, `0`);
366	continue;
367	}
368
369	/ We know that we need an argument. /
370	arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
371
372	switch (nowtok)
373	{
374	case tok_code_set_name:
375	case tok_repertoiremap:
376	if (arg->tok != tok_ident && arg->tok != tok_string)
377	{
378	badarg:
379	lr_error (cmfile, _("syntax error in prolog: %s"),
380	_("bad argument"));
381
382	lr_ignore_rest (cmfile, `0`);
383	continue;
384	}
385
386	if (nowtok == tok_code_set_name)
387	result->code_set_name = obstack_copy0 (&result->mem_pool,
388	arg->val.str.startmb,
389	arg->val.str.lenmb);
390	else
391	result->repertoiremap = obstack_copy0 (&result->mem_pool,
392	arg->val.str.startmb,
393	arg->val.str.lenmb);
394
395	lr_ignore_rest (cmfile, `1`);
396	continue;
397
398	case tok_mb_cur_max:
399	case tok_mb_cur_min:
400	if (arg->tok != tok_number)
401	goto badarg;
402
403	if ((nowtok == tok_mb_cur_max
404	&& result->mb_cur_max != `0`)
405	\|\| (nowtok == tok_mb_cur_max
406	&& result->mb_cur_max != `0`))
407	lr_error (cmfile, _("duplicate definition of <%s>"),
408	nowtok == tok_mb_cur_min
409	? "mb_cur_min" : "mb_cur_max");
410
411	if (arg->val.num < `1`)
412	{
413	lr_error (cmfile,
414	_("value for <%s> must be 1 or greater"),
415	nowtok == tok_mb_cur_min
416	? "mb_cur_min" : "mb_cur_max");
417
418	lr_ignore_rest (cmfile, `0`);
419	continue;
420	}
421	if ((nowtok == tok_mb_cur_max && result->mb_cur_min != `0`
422	&& (int) arg->val.num < result->mb_cur_min)
423	\|\| (nowtok == tok_mb_cur_min && result->mb_cur_max != `0`
424	&& (int) arg->val.num > result->mb_cur_max))
425	{
426	lr_error (cmfile, _("\
427	value of <%s> must be greater or equal than the value of <%s>"),
428	"mb_cur_max", "mb_cur_min");
429
430	lr_ignore_rest (cmfile, `0`);
431	continue;
432	}
433
434	if (nowtok == tok_mb_cur_max)
435	result->mb_cur_max = arg->val.num;
436	else
437	result->mb_cur_min = arg->val.num;
438
439	lr_ignore_rest (cmfile, `1`);
440	continue;
441
442	case tok_escape_char:
443	case tok_comment_char:
444	if (arg->tok != tok_ident)
445	goto badarg;
446
447	if (arg->val.str.lenmb != `1`)
448	{
449	lr_error (cmfile, _("\
450	argument to <%s> must be a single character"),
451	nowtok == tok_escape_char ? "escape_char"
452	: "comment_char");
453
454	lr_ignore_rest (cmfile, `0`);
455	continue;
456	}
457
458	if (nowtok == tok_escape_char)
459	cmfile->escape_char = *arg->val.str.startmb;
460	else
461	cmfile->comment_char = *arg->val.str.startmb;
462
463	lr_ignore_rest (cmfile, `1`);
464	continue;
465
466	case tok_g0esc:
467	case tok_g1esc:
468	case tok_g2esc:
469	case tok_g3esc:
470	case tok_escseq:
471	lr_ignore_rest (cmfile, `0`); / XXX /
472	continue;
473
474	case tok_include:
475	lr_error (cmfile, _("\
476	character sets with locking states are not supported"));
477	exit (`4`);
478
479	default:
480	/ Cannot happen. /
481	assert (! "Should not happen");
482	}
483	break;
484
485	case `2`:
486	/ We have seen `CHARMAP' and now are in the body. Each line*
487	must have the format "%s %s %s\n" or "%s...%s %s %s\n". /*
488	if (nowtok == tok_eol)
489	/ Ignore empty lines. /
490	continue;
491
492	if (nowtok == tok_end)
493	{
494	expected_tok = tok_charmap;
495	expected_str = "CHARMAP";
496	state = `90`;
497	continue;
498	}
499
500	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
501	{
502	lr_error (cmfile, _("syntax error in %s definition: %s"),
503	"CHARMAP", _("no symbolic name given"));
504
505	lr_ignore_rest (cmfile, `0`);
506	continue;
507	}
508
509	/ If the previous line was not completely correct free the*
510	used memory. /*
511	if (from_name != NULL)
512	obstack_free (&result->mem_pool, from_name);
513
514	if (nowtok == tok_bsymbol)
515	from_name = (char *) obstack_copy0 (&result->mem_pool,
516	now->val.str.startmb,
517	now->val.str.lenmb);
518	else
519	{
520	obstack_printf (&result->mem_pool, "U%08X",
521	cmfile->token.val.ucs4);
522	obstack_1grow (&result->mem_pool, `'\0'`);
523	from_name = (char *) obstack_finish (&result->mem_pool);
524	}
525	to_name = NULL;
526
527	state = `3`;
528	continue;
529
530	case `3`:
531	/ We have two possibilities: We can see an ellipsis or an*
532	encoding value. /*
533	if (nowtok == tok_ellipsis3 \|\| nowtok == tok_ellipsis4
534	\|\| nowtok == tok_ellipsis2 \|\| nowtok == tok_ellipsis4_2
535	\|\| nowtok == tok_ellipsis2_2)
536	{
537	ellipsis = nowtok;
538	if (nowtok == tok_ellipsis4_2)
539	{
540	step = `2`;
541	nowtok = tok_ellipsis4;
542	}
543	else if (nowtok == tok_ellipsis2_2)
544	{
545	step = `2`;
546	nowtok = tok_ellipsis2;
547	}
548	state = `4`;
549	continue;
550	}
551	/ FALLTHROUGH /
552
553	case `5`:
554	if (nowtok != tok_charcode)
555	{
556	lr_error (cmfile, _("syntax error in %s definition: %s"),
557	"CHARMAP", _("invalid encoding given"));
558
559	lr_ignore_rest (cmfile, `0`);
560
561	state = `2`;
562	continue;
563	}
564
565	if (now->val.charcode.nbytes < result->mb_cur_min)
566	lr_error (cmfile, _("too few bytes in character encoding"));
567	else if (now->val.charcode.nbytes > result->mb_cur_max)
568	lr_error (cmfile, _("too many bytes in character encoding"));
569	else
570	charmap_new_char (cmfile, result, now->val.charcode.nbytes,
571	now->val.charcode.bytes, from_name, to_name,
572	ellipsis != tok_ellipsis2, step);
573
574	/ Ignore trailing comment silently. /
575	lr_ignore_rest (cmfile, `0`);
576
577	from_name = NULL;
578	to_name = NULL;
579	ellipsis = tok_none;
580	step = `1`;
581
582	state = `2`;
583	continue;
584
585	case `4`:
586	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
587	{
588	lr_error (cmfile, _("syntax error in %s definition: %s"),
589	"CHARMAP",
590	_("no symbolic name given for end of range"));
591
592	lr_ignore_rest (cmfile, `0`);
593	continue;
594	}
595
596	/ Copy the to-name in a safe place. /
597	if (nowtok == tok_bsymbol)
598	to_name = (char *) obstack_copy0 (&result->mem_pool,
599	cmfile->token.val.str.startmb,
600	cmfile->token.val.str.lenmb);
601	else
602	{
603	obstack_printf (&result->mem_pool, "U%08X",
604	cmfile->token.val.ucs4);
605	obstack_1grow (&result->mem_pool, `'\0'`);
606	to_name = (char *) obstack_finish (&result->mem_pool);
607	}
608
609	state = `5`;
610	continue;
611
612	case `90`:
613	if (nowtok != expected_tok)
614	lr_error (cmfile, _("\
615	%1$s: definition does not end with `END %1$s'"), expected_str);
616
617	lr_ignore_rest (cmfile, nowtok == expected_tok);
618	state = `91`;
619	continue;
620
621	case `91`:
622	/ Waiting for WIDTH... /
623	if (nowtok == tok_eol)
624	/ Ignore empty lines. /
625	continue;
626
627	if (nowtok == tok_width_default)
628	{
629	state = `92`;
630	continue;
631	}
632
633	if (nowtok == tok_width)
634	{
635	lr_ignore_rest (cmfile, `1`);
636	state = `93`;
637	continue;
638	}
639
640	if (nowtok == tok_width_variable)
641	{
642	lr_ignore_rest (cmfile, `1`);
643	state = `98`;
644	continue;
645	}
646
647	lr_error (cmfile, _("\
648	only WIDTH definitions are allowed to follow the CHARMAP definition"));
649
650	lr_ignore_rest (cmfile, `0`);
651	continue;
652
653	case `92`:
654	if (nowtok != tok_number)
655	lr_error (cmfile, _("value for %s must be an integer"),
656	"WIDTH_DEFAULT");
657	else
658	result->width_default = now->val.num;
659
660	lr_ignore_rest (cmfile, nowtok == tok_number);
661
662	state = `91`;
663	continue;
664
665	case `93`:
666	/ We now expect `END WIDTH' or lines of the format "%s %d\n" or*
667	"%s...%s %d\n". /*
668	if (nowtok == tok_eol)
669	/ ignore empty lines. /
670	continue;
671
672	if (nowtok == tok_end)
673	{
674	expected_tok = tok_width;
675	expected_str = "WIDTH";
676	state = `90`;
677	continue;
678	}
679
680	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
681	{
682	lr_error (cmfile, _("syntax error in %s definition: %s"),
683	"WIDTH", _("no symbolic name given"));
684
685	lr_ignore_rest (cmfile, `0`);
686	continue;
687	}
688
689	if (from_name != NULL)
690	obstack_free (&result->mem_pool, from_name);
691
692	if (nowtok == tok_bsymbol)
693	from_name = (char *) obstack_copy0 (&result->mem_pool,
694	now->val.str.startmb,
695	now->val.str.lenmb);
696	else
697	{
698	obstack_printf (&result->mem_pool, "U%08X",
699	cmfile->token.val.ucs4);
700	obstack_1grow (&result->mem_pool, `'\0'`);
701	from_name = (char *) obstack_finish (&result->mem_pool);
702	}
703
704	to_name = NULL;
705
706	state = `94`;
707	continue;
708
709	case `94`:
710	if (nowtok == tok_ellipsis3)
711	{
712	state = `95`;
713	continue;
714	}
715	/ Fall through. /
716
717	case `96`:
718	if (nowtok != tok_number)
719	lr_error (cmfile, _("value for %s must be an integer"),
720	"WIDTH");
721	else
722	{
723	/ Store width for chars. /
724	new_width (cmfile, result, from_name, to_name, now->val.num);
725
726	from_name = NULL;
727	to_name = NULL;
728	}
729
730	lr_ignore_rest (cmfile, nowtok == tok_number);
731
732	state = `93`;
733	continue;
734
735	case `95`:
736	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
737	{
738	lr_error (cmfile, _("syntax error in %s definition: %s"),
739	"WIDTH", _("no symbolic name given for end of range"));
740
741	lr_ignore_rest (cmfile, `0`);
742
743	state = `93`;
744	continue;
745	}
746
747	if (nowtok == tok_bsymbol)
748	to_name = (char *) obstack_copy0 (&result->mem_pool,
749	now->val.str.startmb,
750	now->val.str.lenmb);
751	else
752	{
753	obstack_printf (&result->mem_pool, "U%08X",
754	cmfile->token.val.ucs4);
755	obstack_1grow (&result->mem_pool, `'\0'`);
756	to_name = (char *) obstack_finish (&result->mem_pool);
757	}
758
759	state = `96`;
760	continue;
761
762	case `98`:
763	/ We now expect `END WIDTH_VARIABLE' or lines of the format*
764	"%s\n" or "%s...%s\n". /*
765	if (nowtok == tok_eol)
766	/ ignore empty lines. /
767	continue;
768
769	if (nowtok == tok_end)
770	{
771	expected_tok = tok_width_variable;
772	expected_str = "WIDTH_VARIABLE";
773	state = `90`;
774	continue;
775	}
776
777	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
778	{
779	lr_error (cmfile, _("syntax error in %s definition: %s"),
780	"WIDTH_VARIABLE", _("no symbolic name given"));
781
782	lr_ignore_rest (cmfile, `0`);
783
784	continue;
785	}
786
787	if (from_name != NULL)
788	obstack_free (&result->mem_pool, from_name);
789
790	if (nowtok == tok_bsymbol)
791	from_name = (char *) obstack_copy0 (&result->mem_pool,
792	now->val.str.startmb,
793	now->val.str.lenmb);
794	else
795	{
796	obstack_printf (&result->mem_pool, "U%08X",
797	cmfile->token.val.ucs4);
798	obstack_1grow (&result->mem_pool, `'\0'`);
799	from_name = (char *) obstack_finish (&result->mem_pool);
800	}
801	to_name = NULL;
802
803	state = `99`;
804	continue;
805
806	case `99`:
807	if (nowtok == tok_ellipsis3)
808	state = `100`;
809
810	/ Store info. /
811	from_name = NULL;
812
813	/ Warn /
814	state = `98`;
815	continue;
816
817	case `100`:
818	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
819	{
820	lr_error (cmfile, _("syntax error in %s definition: %s"),
821	"WIDTH_VARIABLE",
822	_("no symbolic name given for end of range"));
823	lr_ignore_rest (cmfile, `0`);
824	continue;
825	}
826
827	if (nowtok == tok_bsymbol)
828	to_name = (char *) obstack_copy0 (&result->mem_pool,
829	now->val.str.startmb,
830	now->val.str.lenmb);
831	else
832	{
833	obstack_printf (&result->mem_pool, "U%08X",
834	cmfile->token.val.ucs4);
835	obstack_1grow (&result->mem_pool, `'\0'`);
836	to_name = (char *) obstack_finish (&result->mem_pool);
837	}
838
839	/ XXX Enter value into table. /
840
841	lr_ignore_rest (cmfile, `1`);
842
843	state = `98`;
844	continue;
845
846	default:
847	record_error (`5`, `0`, _("%s: error in state machine"),
848	__FILE__);
849	/ NOTREACHED /
850	}
851	break;
852	}
853
854	if (state != `91`)
855	record_error (`0`, `0`, _("%s: premature end of file"),
856	cmfile->fname);
857
858	lr_close (cmfile);
859
860	return result;
861	}
862
863
864	static void
865	new_width (struct linereader cmfile, struct* charmap_t *result,
866	const char from, const* char to, unsigned* long int width)
867	{
868	struct charseq *from_val;
869	struct charseq *to_val;
870
871	from_val = charmap_find_value (result, from, strlen (from));
872	if (from_val == NULL)
873	{
874	lr_error (cmfile, _("unknown character `%s'"), from);
875	return;
876	}
877
878	if (to == NULL)
879	to_val = from_val;
880	else
881	{
882	to_val = charmap_find_value (result, to, strlen (to));
883	if (to_val == NULL)
884	{
885	lr_error (cmfile, _("unknown character `%s'"), to);
886	return;
887	}
888
889	/ Make sure the number of bytes for the end points of the range*
890	is correct. /*
891	if (from_val->nbytes != to_val->nbytes)
892	{
893	lr_error (cmfile, _("\
894	number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
895	from_val->nbytes, to_val->nbytes);
896	return;
897	}
898	}
899
900	if (result->nwidth_rules >= result->nwidth_rules_max)
901	{
902	size_t new_size = result->nwidth_rules + `32`;
903	struct width_rule *new_rules =
904	(struct width_rule *) obstack_alloc (&result->mem_pool,
905	(new_size
906	* sizeof (struct width_rule)));
907
908	memcpy (new_rules, result->width_rules,
909	result->nwidth_rules_max * sizeof (struct width_rule));
910
911	result->width_rules = new_rules;
912	result->nwidth_rules_max = new_size;
913	}
914
915	result->width_rules[result->nwidth_rules].from = from_val;
916	result->width_rules[result->nwidth_rules].to = to_val;
917	result->width_rules[result->nwidth_rules].width = (unsigned int) width;
918	++result->nwidth_rules;
919	}
920
921
922	struct charseq *
923	charmap_find_value (const struct charmap_t cm, const* char *name, size_t len)
924	{
925	void *result;
926
927	return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
928	< `0` ? NULL : (struct charseq *) result);
929	}
930
931
932	static void
933	charmap_new_char (struct linereader lr, struct* charmap_t *cm,
934	size_t nbytes, unsigned char *bytes,
935	const char from, const* char *to,
936	int decimal_ellipsis, int step)
937	{
938	hash_table *ht = &cm->char_table;
939	hash_table *bt = &cm->byte_table;
940	struct obstack *ob = &cm->mem_pool;
941	char *from_end;
942	char *to_end;
943	const char *cp;
944	int prefix_len, len1, len2;
945	unsigned int from_nr, to_nr, cnt;
946	struct charseq *newp;
947
948	len1 = strlen (from);
949
950	if (to == NULL)
951	{
952	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
953	newp->nbytes = nbytes;
954	memcpy (newp->bytes, bytes, nbytes);
955	newp->name = from;
956
957	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
958	if ((from[`0`] == `'U'` \|\| from[`0`] == `'P'`) && (len1 == `5` \|\| len1 == `9`))
959	{
960	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
961	xxxx and xxxxxxxx are hexadecimal numbers. In this case
962	we use the value of xxxx or xxxxxxxx as the UCS4 value of
963	this character and we don't have to consult the repertoire
964	map.
965
966	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
967	and xxxxxxxx also give the code point in UCS4 but this must
968	be in the private, i.e., unassigned, area. This should be
969	used for characters which do not (yet) have an equivalent
970	in ISO 10646 and Unicode. /*
971	char *endp;
972
973	errno = `0`;
974	newp->ucs4 = strtoul (from + `1`, &endp, `16`);
975	if (endp - from != len1
976	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
977	\|\| newp->ucs4 >= `0x80000000`)
978	/ This wasn't successful. Signal this name cannot be a*
979	correct UCS value. /*
980	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
981	}
982
983	insert_entry (ht, from, len1, newp);
984	insert_entry (bt, newp->bytes, nbytes, newp);
985	/ Please note that it isn't a bug if a symbol is defined more*
986	than once. All later definitions are simply discarded. /*
987	return;
988	}
989
990	/ We have a range: the names must have names with equal prefixes*
991	and an equal number of digits, where the second number is greater
992	or equal than the first. /*
993	len2 = strlen (to);
994
995	if (len1 != len2)
996	{
997	illegal_range:
998	lr_error (lr, _("invalid names for character range"));
999	return;
1000	}
1001
1002	cp = &from[len1 - `1`];
1003	if (decimal_ellipsis)
1004	while (isdigit (*cp) && cp >= from)
1005	--cp;
1006	else
1007	while (isxdigit (*cp) && cp >= from)
1008	{
1009	if (!isdigit (cp) && !isupper (cp))
1010	lr_error (lr, _("\
1011	hexadecimal range format should use only capital characters"));
1012	--cp;
1013	}
1014
1015	prefix_len = (cp - from) + `1`;
1016
1017	if (cp == &from[len1 - `1`] \|\| strncmp (from, to, prefix_len) != `0`)
1018	goto illegal_range;
1019
1020	errno = `0`;
1021	from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? `10` : `16`);
1022	if (*from_end != `'\0'` \|\| (from_nr == UINT_MAX && errno == ERANGE)
1023	\|\| ((to_nr = strtoul (&to[prefix_len], &to_end,
1024	decimal_ellipsis ? `10` : `16`)) == UINT_MAX
1025	&& errno == ERANGE)
1026	\|\| *to_end != `'\0'`)
1027	{
1028	lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029	return;
1030	}
1031
1032	if (from_nr > to_nr)
1033	{
1034	lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035	return;
1036	}
1037
1038	for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039	{
1040	char *name_end;
1041	obstack_printf (ob, decimal_ellipsis ? "%.s%0d" : "%.s%0X",
1042	prefix_len, from, len1 - prefix_len, cnt);
1043	obstack_1grow (ob, `'\0'`);
1044	name_end = obstack_finish (ob);
1045
1046	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
1047	newp->nbytes = nbytes;
1048	memcpy (newp->bytes, bytes, nbytes);
1049	newp->name = name_end;
1050
1051	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052	if ((name_end[`0`] == `'U'` \|\| name_end[`0`] == `'P'`)
1053	&& (len1 == `5` \|\| len1 == `9`))
1054	{
1055	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
1056	xxxx and xxxxxxxx are hexadecimal numbers. In this case
1057	we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058	this character and we don't have to consult the repertoire
1059	map.
1060
1061	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062	and xxxxxxxx also give the code point in UCS4 but this must
1063	be in the private, i.e., unassigned, area. This should be
1064	used for characters which do not (yet) have an equivalent
1065	in ISO 10646 and Unicode. /*
1066	char *endp;
1067
1068	errno = `0`;
1069	newp->ucs4 = strtoul (name_end + `1`, &endp, `16`);
1070	if (endp - name_end != len1
1071	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
1072	\|\| newp->ucs4 >= `0x80000000`)
1073	/ This wasn't successful. Signal this name cannot be a*
1074	correct UCS value. /*
1075	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076	}
1077
1078	insert_entry (ht, name_end, len1, newp);
1079	insert_entry (bt, newp->bytes, nbytes, newp);
1080	/ Please note we don't examine the return value since it is no error*
1081	if we have two definitions for a symbol. /*
1082
1083	/ Increment the value in the byte sequence. /
1084	if (++bytes[nbytes - `1`] == `'\0'`)
1085	{
1086	int b = nbytes - `2`;
1087
1088	do
1089	if (b < `0`)
1090	{
1091	lr_error (lr,
1092	_("resulting bytes for range not representable."));
1093	return;
1094	}
1095	while (++bytes[b--] == `0`);
1096	}
1097	}
1098	}
1099
1100
1101	struct charseq *
1102	charmap_find_symbol (const struct charmap_t cm, const* char *bytes,
1103	size_t nbytes)
1104	{
1105	void *result;
1106
1107	return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108	< `0` ? NULL : (struct charseq *) result);
1109	}
1110

Browse the source code of glibc/locale/programs/charmap.c