charmap.c source code [glibc/locale/programs/charmap.c]

/* Copyright (C) 1996-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
17
18	#ifdef HAVE_CONFIG_H
19	# include <config.h>
20	#endif
21
22	#include <ctype.h>
23	#include <errno.h>
24	#include <libintl.h>
25	#include <limits.h>
26	#include <stdio.h>
27	#include <stdlib.h>
28	#include <string.h>
29	#include <stdint.h>
30
31	#include "localedef.h"
32	#include "linereader.h"
33	#include "charmap.h"
34	#include "charmap-dir.h"
35
36	#include <assert.h>
37
38
/* Define the lookup function.  */
40	#include "charmap-kw.h"
41
42
/* Prototypes for local functions.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the charmap it loaded fails the
   ASCII compatibility test.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
61
62	static struct linereader *
63	cmlr_open (const char directory, const* char *name, kw_hash_fct_t hf)
64	{
65	FILE *fp;
66
67	fp = charmap_open (directory, name);
68	if (fp == NULL)
69	return NULL;
70	else
71	{
72	size_t dlen = strlen (directory);
73	int add_slash = (dlen == `0` \|\| directory[dlen - `1`] != `'/'`);
74	size_t nlen = strlen (name);
75	char *pathname;
76	char *p;
77
78	pathname = alloca (dlen + add_slash + nlen + `1`);
79	p = stpcpy (pathname, directory);
80	if (add_slash)
81	*p++ = `'/'`;
82	stpcpy (p, name);
83
84	return lr_create (fp, pathname, hf);
85	}
86	}
87
88	struct charmap_t *
89	charmap_read (const char filename, int* verbose, int error_not_found,
90	int be_quiet, int use_default)
91	{
92	struct charmap_t *result = NULL;
93
94	if (filename != NULL)
95	{
96	struct linereader *cmfile;
97
98	/ First try the name as found in the parameter. /
99	cmfile = lr_open (filename, charmap_hash);
100	if (cmfile == NULL)
101	{
102	/ No successful. So start looking through the directories*
103	in the I18NPATH if this is a simple name. /*
104	if (strchr (filename, `'/'`) == NULL)
105	{
106	char *i18npath = getenv ("I18NPATH");
107	if (i18npath != NULL && *i18npath != `'\0'`)
108	{
109	const size_t pathlen = strlen (i18npath);
110	char i18npathbuf[pathlen + `1`];
111	char path[pathlen + sizeof ("/charmaps")];
112	char *next;
113	i18npath = memcpy (i18npathbuf, i18npath, pathlen + `1`);
114
115	while (cmfile == NULL
116	&& (next = strsep (&i18npath, ":")) != NULL)
117	{
118	stpcpy (stpcpy (path, next), "/charmaps");
119	cmfile = cmlr_open (path, filename, charmap_hash);
120
121	if (cmfile == NULL)
122	/ Try without the "/charmaps" part. /
123	cmfile = cmlr_open (next, filename, charmap_hash);
124	}
125	}
126
127	if (cmfile == NULL)
128	/ Try the default directory. /
129	cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
130	}
131	}
132
133	if (cmfile != NULL)
134	result = parse_charmap (cmfile, verbose, be_quiet);
135
136	if (result == NULL && error_not_found)
137	record_error (`0`, errno,
138	_("character map file `%s' not found"),
139	filename);
140	}
141
142	if (result == NULL && filename != NULL && strchr (filename, `'/'`) == NULL)
143	{
144	/ OK, one more try. We also accept the names given to the*
145	character sets in the files. Sometimes they differ from the
146	file name. /*
147	CHARMAP_DIR *dir;
148
149	dir = charmap_opendir (CHARMAP_PATH);
150	if (dir != NULL)
151	{
152	const char *dirent;
153
154	while ((dirent = charmap_readdir (dir)) != NULL)
155	{
156	char **aliases;
157	char **p;
158	int found;
159
160	aliases = charmap_aliases (CHARMAP_PATH, dirent);
161	found = `0`;
162	for (p = aliases; *p; p++)
163	if (strcasecmp (*p, filename) == `0`)
164	{
165	found = `1`;
166	break;
167	}
168	charmap_free_aliases (aliases);
169
170	if (found)
171	{
172	struct linereader *cmfile;
173
174	cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175	if (cmfile != NULL)
176	result = parse_charmap (cmfile, verbose, be_quiet);
177
178	break;
179	}
180	}
181
182	charmap_closedir (dir);
183	}
184	}
185
186	if (result == NULL && DEFAULT_CHARMAP != NULL)
187	{
188	struct linereader *cmfile;
189
190	cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191	if (cmfile != NULL)
192	result = parse_charmap (cmfile, verbose, be_quiet);
193
194	if (result == NULL)
195	record_error (`4`, errno,
196	_("default character map file `%s' not found"),
197	DEFAULT_CHARMAP);
198	}
199
200	if (result != NULL && result->code_set_name == NULL)
201	/ The input file does not specify a code set name. This*
202	shouldn't happen but we should cope with it. /*
203	result->code_set_name = basename (filename);
204
205	/ Test of ASCII compatibility of locale encoding.*
206
207	Verify that the encoding to be used in a locale is ASCII compatible,
208	at least for the graphic characters, excluding the control characters,
209	'$' and '@'. This constraint comes from an ISO C 99 restriction.
210
211	ISO C 99 section 7.17.(2) (about wchar_t):
212	the null character shall have the code value zero and each member of
213	the basic character set shall have a code value equal to its value
214	when used as the lone character in an integer character constant.
215	ISO C 99 section 5.2.1.(3):
216	Both the basic source and basic execution character sets shall have
217	the following members: the 26 uppercase letters of the Latin alphabet
218	A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219	the 26 lowercase letters of the Latin alphabet
220	a b c d e f g h i j k l m n o p q r s t u v w x y z
221	the 10 decimal digits
222	0 1 2 3 4 5 6 7 8 9
223	the following 29 graphic characters
224	! " # % & ' ( ) + , - . / : ; < = > ? [ \ ] ^ _ { \| } ~*
225	the space character, and control characters representing horizontal
226	tab, vertical tab, and form feed.
227
228	Therefore, for all members of the "basic character set", the 'char' code
229	must have the same value as the 'wchar_t' code, which in glibc is the
230	same as the Unicode code, which for all of the enumerated characters
231	is identical to the ASCII code. /*
232	if (result != NULL && use_default)
233	{
234	static const char basic_charset[] =
235	{
236	`'A'`, `'B'`, `'C'`, `'D'`, `'E'`, `'F'`, `'G'`, `'H'`, `'I'`, `'J'`, `'K'`, `'L'`, `'M'`,
237	`'N'`, `'O'`, `'P'`, `'Q'`, `'R'`, `'S'`, `'T'`, `'U'`, `'V'`, `'W'`, `'X'`, `'Y'`, `'Z'`,
238	`'a'`, `'b'`, `'c'`, `'d'`, `'e'`, `'f'`, `'g'`, `'h'`, `'i'`, `'j'`, `'k'`, `'l'`, `'m'`,
239	`'n'`, `'o'`, `'p'`, `'q'`, `'r'`, `'s'`, `'t'`, `'u'`, `'v'`, `'w'`, `'x'`, `'y'`, `'z'`,
240	`'0'`, `'1'`, `'2'`, `'3'`, `'4'`, `'5'`, `'6'`, `'7'`, `'8'`, `'9'`,
241	`'!'`, `'"'`, `'#'`, `'%'`, `'&'`, `'\''`, `'('`, `')'`, `'*'`, `'+'`, `','`, `'-'`,
242	`'.'`, `'/'`, `':'`, `';'`, `'<'`, `'='`, `'>'`, `'?'`, `'['`, `'\\'`, `']'`, `'^'`,
243	`'_'`, `'{'`, `'\|'`, `'}'`, `'~'`, `' '`, `'\t'`, `'\v'`, `'\f'`, `'\0'`
244	};
245	int failed = `0`;
246	const char *p = basic_charset;
247
248	do
249	{
250	struct charseq *seq = charmap_find_symbol (result, p, `1`);
251
252	if (seq == NULL \|\| seq->ucs4 != (uint32_t) *p)
253	failed = `1`;
254	}
255	while (*p++ != `'\0'`);
256
257	if (failed)
258	{
259	/ A user may disable the ASCII compatibility warning check,*
260	but we must remember that the encoding is not ASCII
261	compatible, since it may have other implications. Later
262	we will set _NL_CTYPE_MAP_TO_NONASCII from this value. /*
263	if (warn_ascii)
264	record_warning (_(
265	"character map `%s' is not ASCII compatible, locale not ISO C compliant "
266	"[--no-warnings=ascii]"),
267	result->code_set_name);
268	enc_not_ascii_compatible = true;
269	}
270	}
271
272	return result;
273	}
274
275
276	static struct charmap_t *
277	parse_charmap (struct linereader cmfile, int* verbose, int be_quiet)
278	{
279	struct charmap_t *result;
280	int state;
281	enum token_t expected_tok = tok_error;
282	const char *expected_str = NULL;
283	char *from_name = NULL;
284	char *to_name = NULL;
285	enum token_t ellipsis = `0`;
286	int step = `1`;
287
288	/ We don't want symbolic names in string to be translated. /
289	cmfile->translate_strings = `0`;
290
291	/ Allocate room for result. /
292	result = (struct charmap_t ) xmalloc (sizeof* (struct charmap_t));
293	memset (result, `'\0'`, sizeof (struct charmap_t));
294	/ The default DEFAULT_WIDTH is 1. /
295	result->width_default = `1`;
296
297	#define obstack_chunk_alloc malloc
298	#define obstack_chunk_free free
299	obstack_init (&result->mem_pool);
300
301	if (init_hash (&result->char_table, `256`)
302	\|\| init_hash (&result->byte_table, `256`))
303	{
304	free (result);
305	return NULL;
306	}
307
308	/ We use a state machine to describe the charmap description file*
309	format. /*
310	state = `1`;
311	while (`1`)
312	{
313	/ What's on? /
314	struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
315	enum token_t nowtok = now->tok;
316	struct token *arg;
317
318	if (nowtok == tok_eof)
319	break;
320
321	switch (state)
322	{
323	case `1`:
324	/ The beginning. We expect the special declarations, EOL or*
325	`CHARMAP'. /*
326	if (nowtok == tok_eol)
327	/ Ignore empty lines. /
328	continue;
329
330	if (nowtok == tok_charmap)
331	{
332	from_name = NULL;
333	to_name = NULL;
334
335	/ We have to set up the real work. Fill in some*
336	default values. /*
337	if (result->mb_cur_max == `0`)
338	result->mb_cur_max = `1`;
339	if (result->mb_cur_min == `0`)
340	result->mb_cur_min = result->mb_cur_max;
341	if (result->mb_cur_min > result->mb_cur_max)
342	{
343	record_error (`0`, `0`, _("\
344	%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
345	cmfile->fname);
346
347	result->mb_cur_min = result->mb_cur_max;
348	}
349
350	lr_ignore_rest (cmfile, `1`);
351
352	state = `2`;
353	continue;
354	}
355
356	if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
357	&& nowtok != tok_mb_cur_min && nowtok != tok_escape_char
358	&& nowtok != tok_comment_char && nowtok != tok_g0esc
359	&& nowtok != tok_g1esc && nowtok != tok_g2esc
360	&& nowtok != tok_g3esc && nowtok != tok_repertoiremap
361	&& nowtok != tok_include)
362	{
363	lr_error (cmfile, _("syntax error in prolog: %s"),
364	_("invalid definition"));
365
366	lr_ignore_rest (cmfile, `0`);
367	continue;
368	}
369
370	/ We know that we need an argument. /
371	arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
372
373	switch (nowtok)
374	{
375	case tok_code_set_name:
376	case tok_repertoiremap:
377	if (arg->tok != tok_ident && arg->tok != tok_string)
378	{
379	badarg:
380	lr_error (cmfile, _("syntax error in prolog: %s"),
381	_("bad argument"));
382
383	lr_ignore_rest (cmfile, `0`);
384	continue;
385	}
386
387	if (nowtok == tok_code_set_name)
388	result->code_set_name = obstack_copy0 (&result->mem_pool,
389	arg->val.str.startmb,
390	arg->val.str.lenmb);
391	else
392	result->repertoiremap = obstack_copy0 (&result->mem_pool,
393	arg->val.str.startmb,
394	arg->val.str.lenmb);
395
396	lr_ignore_rest (cmfile, `1`);
397	continue;
398
399	case tok_mb_cur_max:
400	case tok_mb_cur_min:
401	if (arg->tok != tok_number)
402	goto badarg;
403
404	if ((nowtok == tok_mb_cur_max
405	&& result->mb_cur_max != `0`)
406	\|\| (nowtok == tok_mb_cur_max
407	&& result->mb_cur_max != `0`))
408	lr_error (cmfile, _("duplicate definition of <%s>"),
409	nowtok == tok_mb_cur_min
410	? "mb_cur_min" : "mb_cur_max");
411
412	if (arg->val.num < `1`)
413	{
414	lr_error (cmfile,
415	_("value for <%s> must be 1 or greater"),
416	nowtok == tok_mb_cur_min
417	? "mb_cur_min" : "mb_cur_max");
418
419	lr_ignore_rest (cmfile, `0`);
420	continue;
421	}
422	if ((nowtok == tok_mb_cur_max && result->mb_cur_min != `0`
423	&& (int) arg->val.num < result->mb_cur_min)
424	\|\| (nowtok == tok_mb_cur_min && result->mb_cur_max != `0`
425	&& (int) arg->val.num > result->mb_cur_max))
426	{
427	lr_error (cmfile, _("\
428	value of <%s> must be greater or equal than the value of <%s>"),
429	"mb_cur_max", "mb_cur_min");
430
431	lr_ignore_rest (cmfile, `0`);
432	continue;
433	}
434
435	if (nowtok == tok_mb_cur_max)
436	result->mb_cur_max = arg->val.num;
437	else
438	result->mb_cur_min = arg->val.num;
439
440	lr_ignore_rest (cmfile, `1`);
441	continue;
442
443	case tok_escape_char:
444	case tok_comment_char:
445	if (arg->tok != tok_ident)
446	goto badarg;
447
448	if (arg->val.str.lenmb != `1`)
449	{
450	lr_error (cmfile, _("\
451	argument to <%s> must be a single character"),
452	nowtok == tok_escape_char ? "escape_char"
453	: "comment_char");
454
455	lr_ignore_rest (cmfile, `0`);
456	continue;
457	}
458
459	if (nowtok == tok_escape_char)
460	cmfile->escape_char = *arg->val.str.startmb;
461	else
462	cmfile->comment_char = *arg->val.str.startmb;
463
464	lr_ignore_rest (cmfile, `1`);
465	continue;
466
467	case tok_g0esc:
468	case tok_g1esc:
469	case tok_g2esc:
470	case tok_g3esc:
471	case tok_escseq:
472	lr_ignore_rest (cmfile, `0`); / XXX /
473	continue;
474
475	case tok_include:
476	lr_error (cmfile, _("\
477	character sets with locking states are not supported"));
478	exit (`4`);
479
480	default:
481	/ Cannot happen. /
482	assert (! "Should not happen");
483	}
484	break;
485
486	case `2`:
487	/ We have seen `CHARMAP' and now are in the body. Each line*
488	must have the format "%s %s %s\n" or "%s...%s %s %s\n". /*
489	if (nowtok == tok_eol)
490	/ Ignore empty lines. /
491	continue;
492
493	if (nowtok == tok_end)
494	{
495	expected_tok = tok_charmap;
496	expected_str = "CHARMAP";
497	state = `90`;
498	continue;
499	}
500
501	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
502	{
503	lr_error (cmfile, _("syntax error in %s definition: %s"),
504	"CHARMAP", _("no symbolic name given"));
505
506	lr_ignore_rest (cmfile, `0`);
507	continue;
508	}
509
510	/ If the previous line was not completely correct free the*
511	used memory. /*
512	if (from_name != NULL)
513	obstack_free (&result->mem_pool, from_name);
514
515	if (nowtok == tok_bsymbol)
516	from_name = (char *) obstack_copy0 (&result->mem_pool,
517	now->val.str.startmb,
518	now->val.str.lenmb);
519	else
520	{
521	obstack_printf (&result->mem_pool, "U%08X",
522	cmfile->token.val.ucs4);
523	obstack_1grow (&result->mem_pool, `'\0'`);
524	from_name = (char *) obstack_finish (&result->mem_pool);
525	}
526	to_name = NULL;
527
528	state = `3`;
529	continue;
530
531	case `3`:
532	/ We have two possibilities: We can see an ellipsis or an*
533	encoding value. /*
534	if (nowtok == tok_ellipsis3 \|\| nowtok == tok_ellipsis4
535	\|\| nowtok == tok_ellipsis2 \|\| nowtok == tok_ellipsis4_2
536	\|\| nowtok == tok_ellipsis2_2)
537	{
538	ellipsis = nowtok;
539	if (nowtok == tok_ellipsis4_2)
540	{
541	step = `2`;
542	nowtok = tok_ellipsis4;
543	}
544	else if (nowtok == tok_ellipsis2_2)
545	{
546	step = `2`;
547	nowtok = tok_ellipsis2;
548	}
549	state = `4`;
550	continue;
551	}
552	/ FALLTHROUGH /
553
554	case `5`:
555	if (nowtok != tok_charcode)
556	{
557	lr_error (cmfile, _("syntax error in %s definition: %s"),
558	"CHARMAP", _("invalid encoding given"));
559
560	lr_ignore_rest (cmfile, `0`);
561
562	state = `2`;
563	continue;
564	}
565
566	if (now->val.charcode.nbytes < result->mb_cur_min)
567	lr_error (cmfile, _("too few bytes in character encoding"));
568	else if (now->val.charcode.nbytes > result->mb_cur_max)
569	lr_error (cmfile, _("too many bytes in character encoding"));
570	else
571	charmap_new_char (cmfile, result, now->val.charcode.nbytes,
572	now->val.charcode.bytes, from_name, to_name,
573	ellipsis != tok_ellipsis2, step);
574
575	/ Ignore trailing comment silently. /
576	lr_ignore_rest (cmfile, `0`);
577
578	from_name = NULL;
579	to_name = NULL;
580	ellipsis = tok_none;
581	step = `1`;
582
583	state = `2`;
584	continue;
585
586	case `4`:
587	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
588	{
589	lr_error (cmfile, _("syntax error in %s definition: %s"),
590	"CHARMAP",
591	_("no symbolic name given for end of range"));
592
593	lr_ignore_rest (cmfile, `0`);
594	continue;
595	}
596
597	/ Copy the to-name in a safe place. /
598	if (nowtok == tok_bsymbol)
599	to_name = (char *) obstack_copy0 (&result->mem_pool,
600	cmfile->token.val.str.startmb,
601	cmfile->token.val.str.lenmb);
602	else
603	{
604	obstack_printf (&result->mem_pool, "U%08X",
605	cmfile->token.val.ucs4);
606	obstack_1grow (&result->mem_pool, `'\0'`);
607	to_name = (char *) obstack_finish (&result->mem_pool);
608	}
609
610	state = `5`;
611	continue;
612
613	case `90`:
614	if (nowtok != expected_tok)
615	lr_error (cmfile, _("\
616	%1$s: definition does not end with `END %1$s'"), expected_str);
617
618	lr_ignore_rest (cmfile, nowtok == expected_tok);
619	state = `91`;
620	continue;
621
622	case `91`:
623	/ Waiting for WIDTH... /
624	if (nowtok == tok_eol)
625	/ Ignore empty lines. /
626	continue;
627
628	if (nowtok == tok_width_default)
629	{
630	state = `92`;
631	continue;
632	}
633
634	if (nowtok == tok_width)
635	{
636	lr_ignore_rest (cmfile, `1`);
637	state = `93`;
638	continue;
639	}
640
641	if (nowtok == tok_width_variable)
642	{
643	lr_ignore_rest (cmfile, `1`);
644	state = `98`;
645	continue;
646	}
647
648	lr_error (cmfile, _("\
649	only WIDTH definitions are allowed to follow the CHARMAP definition"));
650
651	lr_ignore_rest (cmfile, `0`);
652	continue;
653
654	case `92`:
655	if (nowtok != tok_number)
656	lr_error (cmfile, _("value for %s must be an integer"),
657	"WIDTH_DEFAULT");
658	else
659	result->width_default = now->val.num;
660
661	lr_ignore_rest (cmfile, nowtok == tok_number);
662
663	state = `91`;
664	continue;
665
666	case `93`:
667	/ We now expect `END WIDTH' or lines of the format "%s %d\n" or*
668	"%s...%s %d\n". /*
669	if (nowtok == tok_eol)
670	/ ignore empty lines. /
671	continue;
672
673	if (nowtok == tok_end)
674	{
675	expected_tok = tok_width;
676	expected_str = "WIDTH";
677	state = `90`;
678	continue;
679	}
680
681	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
682	{
683	lr_error (cmfile, _("syntax error in %s definition: %s"),
684	"WIDTH", _("no symbolic name given"));
685
686	lr_ignore_rest (cmfile, `0`);
687	continue;
688	}
689
690	if (from_name != NULL)
691	obstack_free (&result->mem_pool, from_name);
692
693	if (nowtok == tok_bsymbol)
694	from_name = (char *) obstack_copy0 (&result->mem_pool,
695	now->val.str.startmb,
696	now->val.str.lenmb);
697	else
698	{
699	obstack_printf (&result->mem_pool, "U%08X",
700	cmfile->token.val.ucs4);
701	obstack_1grow (&result->mem_pool, `'\0'`);
702	from_name = (char *) obstack_finish (&result->mem_pool);
703	}
704
705	to_name = NULL;
706
707	state = `94`;
708	continue;
709
710	case `94`:
711	if (nowtok == tok_ellipsis3)
712	{
713	state = `95`;
714	continue;
715	}
716
717	case `96`:
718	if (nowtok != tok_number)
719	lr_error (cmfile, _("value for %s must be an integer"),
720	"WIDTH");
721	else
722	{
723	/ Store width for chars. /
724	new_width (cmfile, result, from_name, to_name, now->val.num);
725
726	from_name = NULL;
727	to_name = NULL;
728	}
729
730	lr_ignore_rest (cmfile, nowtok == tok_number);
731
732	state = `93`;
733	continue;
734
735	case `95`:
736	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
737	{
738	lr_error (cmfile, _("syntax error in %s definition: %s"),
739	"WIDTH", _("no symbolic name given for end of range"));
740
741	lr_ignore_rest (cmfile, `0`);
742
743	state = `93`;
744	continue;
745	}
746
747	if (nowtok == tok_bsymbol)
748	to_name = (char *) obstack_copy0 (&result->mem_pool,
749	now->val.str.startmb,
750	now->val.str.lenmb);
751	else
752	{
753	obstack_printf (&result->mem_pool, "U%08X",
754	cmfile->token.val.ucs4);
755	obstack_1grow (&result->mem_pool, `'\0'`);
756	to_name = (char *) obstack_finish (&result->mem_pool);
757	}
758
759	state = `96`;
760	continue;
761
762	case `98`:
763	/ We now expect `END WIDTH_VARIABLE' or lines of the format*
764	"%s\n" or "%s...%s\n". /*
765	if (nowtok == tok_eol)
766	/ ignore empty lines. /
767	continue;
768
769	if (nowtok == tok_end)
770	{
771	expected_tok = tok_width_variable;
772	expected_str = "WIDTH_VARIABLE";
773	state = `90`;
774	continue;
775	}
776
777	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
778	{
779	lr_error (cmfile, _("syntax error in %s definition: %s"),
780	"WIDTH_VARIABLE", _("no symbolic name given"));
781
782	lr_ignore_rest (cmfile, `0`);
783
784	continue;
785	}
786
787	if (from_name != NULL)
788	obstack_free (&result->mem_pool, from_name);
789
790	if (nowtok == tok_bsymbol)
791	from_name = (char *) obstack_copy0 (&result->mem_pool,
792	now->val.str.startmb,
793	now->val.str.lenmb);
794	else
795	{
796	obstack_printf (&result->mem_pool, "U%08X",
797	cmfile->token.val.ucs4);
798	obstack_1grow (&result->mem_pool, `'\0'`);
799	from_name = (char *) obstack_finish (&result->mem_pool);
800	}
801	to_name = NULL;
802
803	state = `99`;
804	continue;
805
806	case `99`:
807	if (nowtok == tok_ellipsis3)
808	state = `100`;
809
810	/ Store info. /
811	from_name = NULL;
812
813	/ Warn /
814	state = `98`;
815	continue;
816
817	case `100`:
818	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
819	{
820	lr_error (cmfile, _("syntax error in %s definition: %s"),
821	"WIDTH_VARIABLE",
822	_("no symbolic name given for end of range"));
823	lr_ignore_rest (cmfile, `0`);
824	continue;
825	}
826
827	if (nowtok == tok_bsymbol)
828	to_name = (char *) obstack_copy0 (&result->mem_pool,
829	now->val.str.startmb,
830	now->val.str.lenmb);
831	else
832	{
833	obstack_printf (&result->mem_pool, "U%08X",
834	cmfile->token.val.ucs4);
835	obstack_1grow (&result->mem_pool, `'\0'`);
836	to_name = (char *) obstack_finish (&result->mem_pool);
837	}
838
839	/ XXX Enter value into table. /
840
841	lr_ignore_rest (cmfile, `1`);
842
843	state = `98`;
844	continue;
845
846	default:
847	record_error (`5`, `0`, _("%s: error in state machine"),
848	__FILE__);
849	/ NOTREACHED /
850	}
851	break;
852	}
853
854	if (state != `91`)
855	record_error (`0`, `0`, _("%s: premature end of file"),
856	cmfile->fname);
857
858	lr_close (cmfile);
859
860	return result;
861	}
862
863
864	static void
865	new_width (struct linereader cmfile, struct* charmap_t *result,
866	const char from, const* char to, unsigned* long int width)
867	{
868	struct charseq *from_val;
869	struct charseq *to_val;
870
871	from_val = charmap_find_value (result, from, strlen (from));
872	if (from_val == NULL)
873	{
874	lr_error (cmfile, _("unknown character `%s'"), from);
875	return;
876	}
877
878	if (to == NULL)
879	to_val = from_val;
880	else
881	{
882	to_val = charmap_find_value (result, to, strlen (to));
883	if (to_val == NULL)
884	{
885	lr_error (cmfile, _("unknown character `%s'"), to);
886	return;
887	}
888
889	/ Make sure the number of bytes for the end points of the range*
890	is correct. /*
891	if (from_val->nbytes != to_val->nbytes)
892	{
893	lr_error (cmfile, _("\
894	number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
895	from_val->nbytes, to_val->nbytes);
896	return;
897	}
898	}
899
900	if (result->nwidth_rules >= result->nwidth_rules_max)
901	{
902	size_t new_size = result->nwidth_rules + `32`;
903	struct width_rule *new_rules =
904	(struct width_rule *) obstack_alloc (&result->mem_pool,
905	(new_size
906	* sizeof (struct width_rule)));
907
908	memcpy (new_rules, result->width_rules,
909	result->nwidth_rules_max * sizeof (struct width_rule));
910
911	result->width_rules = new_rules;
912	result->nwidth_rules_max = new_size;
913	}
914
915	result->width_rules[result->nwidth_rules].from = from_val;
916	result->width_rules[result->nwidth_rules].to = to_val;
917	result->width_rules[result->nwidth_rules].width = (unsigned int) width;
918	++result->nwidth_rules;
919	}
920
921
922	struct charseq *
923	charmap_find_value (const struct charmap_t cm, const* char *name, size_t len)
924	{
925	void *result;
926
927	return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
928	< `0` ? NULL : (struct charseq *) result);
929	}
930
931
932	static void
933	charmap_new_char (struct linereader lr, struct* charmap_t *cm,
934	size_t nbytes, unsigned char *bytes,
935	const char from, const* char *to,
936	int decimal_ellipsis, int step)
937	{
938	hash_table *ht = &cm->char_table;
939	hash_table *bt = &cm->byte_table;
940	struct obstack *ob = &cm->mem_pool;
941	char *from_end;
942	char *to_end;
943	const char *cp;
944	int prefix_len, len1, len2;
945	unsigned int from_nr, to_nr, cnt;
946	struct charseq *newp;
947
948	len1 = strlen (from);
949
950	if (to == NULL)
951	{
952	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
953	newp->nbytes = nbytes;
954	memcpy (newp->bytes, bytes, nbytes);
955	newp->name = from;
956
957	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
958	if ((from[`0`] == `'U'` \|\| from[`0`] == `'P'`) && (len1 == `5` \|\| len1 == `9`))
959	{
960	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
961	xxxx and xxxxxxxx are hexadecimal numbers. In this case
962	we use the value of xxxx or xxxxxxxx as the UCS4 value of
963	this character and we don't have to consult the repertoire
964	map.
965
966	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
967	and xxxxxxxx also give the code point in UCS4 but this must
968	be in the private, i.e., unassigned, area. This should be
969	used for characters which do not (yet) have an equivalent
970	in ISO 10646 and Unicode. /*
971	char *endp;
972
973	errno = `0`;
974	newp->ucs4 = strtoul (from + `1`, &endp, `16`);
975	if (endp - from != len1
976	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
977	\|\| newp->ucs4 >= `0x80000000`)
978	/ This wasn't successful. Signal this name cannot be a*
979	correct UCS value. /*
980	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
981	}
982
983	insert_entry (ht, from, len1, newp);
984	insert_entry (bt, newp->bytes, nbytes, newp);
985	/ Please note that it isn't a bug if a symbol is defined more*
986	than once. All later definitions are simply discarded. /*
987	return;
988	}
989
990	/ We have a range: the names must have names with equal prefixes*
991	and an equal number of digits, where the second number is greater
992	or equal than the first. /*
993	len2 = strlen (to);
994
995	if (len1 != len2)
996	{
997	illegal_range:
998	lr_error (lr, _("invalid names for character range"));
999	return;
1000	}
1001
1002	cp = &from[len1 - `1`];
1003	if (decimal_ellipsis)
1004	while (isdigit (*cp) && cp >= from)
1005	--cp;
1006	else
1007	while (isxdigit (*cp) && cp >= from)
1008	{
1009	if (!isdigit (cp) && !isupper (cp))
1010	lr_error (lr, _("\
1011	hexadecimal range format should use only capital characters"));
1012	--cp;
1013	}
1014
1015	prefix_len = (cp - from) + `1`;
1016
1017	if (cp == &from[len1 - `1`] \|\| strncmp (from, to, prefix_len) != `0`)
1018	goto illegal_range;
1019
1020	errno = `0`;
1021	from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? `10` : `16`);
1022	if (*from_end != `'\0'` \|\| (from_nr == UINT_MAX && errno == ERANGE)
1023	\|\| ((to_nr = strtoul (&to[prefix_len], &to_end,
1024	decimal_ellipsis ? `10` : `16`)) == UINT_MAX
1025	&& errno == ERANGE)
1026	\|\| *to_end != `'\0'`)
1027	{
1028	lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029	return;
1030	}
1031
1032	if (from_nr > to_nr)
1033	{
1034	lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035	return;
1036	}
1037
1038	for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039	{
1040	char *name_end;
1041	obstack_printf (ob, decimal_ellipsis ? "%.s%0d" : "%.s%0X",
1042	prefix_len, from, len1 - prefix_len, cnt);
1043	obstack_1grow (ob, `'\0'`);
1044	name_end = obstack_finish (ob);
1045
1046	newp = (struct charseq ) obstack_alloc (ob, sizeof* (*newp) + nbytes);
1047	newp->nbytes = nbytes;
1048	memcpy (newp->bytes, bytes, nbytes);
1049	newp->name = name_end;
1050
1051	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052	if ((name_end[`0`] == `'U'` \|\| name_end[`0`] == `'P'`)
1053	&& (len1 == `5` \|\| len1 == `9`))
1054	{
1055	/ Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where*
1056	xxxx and xxxxxxxx are hexadecimal numbers. In this case
1057	we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058	this character and we don't have to consult the repertoire
1059	map.
1060
1061	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062	and xxxxxxxx also give the code point in UCS4 but this must
1063	be in the private, i.e., unassigned, area. This should be
1064	used for characters which do not (yet) have an equivalent
1065	in ISO 10646 and Unicode. /*
1066	char *endp;
1067
1068	errno = `0`;
1069	newp->ucs4 = strtoul (name_end + `1`, &endp, `16`);
1070	if (endp - name_end != len1
1071	\|\| (newp->ucs4 == ~((uint32_t) `0`) && errno == ERANGE)
1072	\|\| newp->ucs4 >= `0x80000000`)
1073	/ This wasn't successful. Signal this name cannot be a*
1074	correct UCS value. /*
1075	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076	}
1077
1078	insert_entry (ht, name_end, len1, newp);
1079	insert_entry (bt, newp->bytes, nbytes, newp);
1080	/ Please note we don't examine the return value since it is no error*
1081	if we have two definitions for a symbol. /*
1082
1083	/ Increment the value in the byte sequence. /
1084	if (++bytes[nbytes - `1`] == `'\0'`)
1085	{
1086	int b = nbytes - `2`;
1087
1088	do
1089	if (b < `0`)
1090	{
1091	lr_error (lr,
1092	_("resulting bytes for range not representable."));
1093	return;
1094	}
1095	while (++bytes[b--] == `0`);
1096	}
1097	}
1098	}
1099
1100
1101	struct charseq *
1102	charmap_find_symbol (const struct charmap_t cm, const* char *bytes,
1103	size_t nbytes)
1104	{
1105	void *result;
1106
1107	return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108	< `0` ? NULL : (struct charseq *) result);
1109	}
1110

Browse the source code of glibc/locale/programs/charmap.c