utf-7.c source code [glibc/iconvdata/utf-7.c]

/* Conversion module for UTF-7.
   Copyright (C) 2000-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/* UTF-7 is a legacy encoding used for transmitting Unicode within the
   ASCII character set, used primarily by mail agents.  New programs
   are encouraged to use UTF-8 instead.

   UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
   original Base64 encoding is defined in RFC 2045.  */
25
26	#include <dlfcn.h>
27	#include <gconv.h>
28	#include <stdint.h>
29	#include <stdlib.h>
30	#include <string.h>
31
32
/* The UTF-7 flavours this module converts.  The numeric value of each
   enumerator is also its position in the `names' list, which is how
   gconv_init derives the variant from a charset name.  */
enum variant
{
  UTF7 = 0,		/* Listed as "UTF-7//".  */
  UTF_7_IMAP = 1	/* Listed as "UTF-7-IMAP//".  */
};
38
/* NUL-separated list of the charset names served by this module, ended
   by an empty string.  Must be in the same order as enum variant above:
   gconv_init counts the entries it skips to obtain the variant.  */
static const char names[] =
  "UTF-7//\0"
  "UTF-7-IMAP//\0"
  "\0";
44
45	static uint32_t
46	shift_character (enum variant const var)
47	{
48	if (var == UTF7)
49	return `'+'`;
50	else if (var == UTF_7_IMAP)
51	return `'&'`;
52	else
53	abort ();
54	}
55
/* Return true iff CH lies in the closed interval
   [LOWER_BOUND, UPPER_BOUND].  */
static bool
between (uint32_t const ch,
	 uint32_t const lower_bound, uint32_t const upper_bound)
{
  /* In range exactly when CH is neither below nor above the bounds.  */
  return !(ch < lower_bound || ch > upper_bound);
}
62
63	/ The set of "direct characters":*
64	A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
65	FOR UTF-7-IMAP
66	A-Z a-z 0-9 ' ( ) , - . / : ? space
67	! " # $ % + ; < = > @ [ \ ] ^ _ ` { \| } ~*
68	*/
69
70	static bool
71	isdirect (uint32_t ch, enum variant var)
72	{
73	if (var == UTF7)
74	return (between (ch, `'A'`, `'Z'`)
75	\|\| between (ch, `'a'`, `'z'`)
76	\|\| between (ch, `'0'`, `'9'`)
77	\|\| ch == `'\''` \|\| ch == `'('` \|\| ch == `')'`
78	\|\| between (ch, `','`, `'/'`)
79	\|\| ch == `':'` \|\| ch == `'?'`
80	\|\| ch == `' '` \|\| ch == `'\t'` \|\| ch == `'\n'` \|\| ch == `'\r'`);
81	else if (var == UTF_7_IMAP)
82	return (ch != `'&'` && between (ch, `' '`, `'~'`));
83	abort ();
84	}
85
86
87	/ The set of "direct and optional direct characters":*
88	A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
89	(UTF-7 only)
90	! " # $ % & ; < = > @ [ ] ^ _ ` { \| }*
91	*/
92
93	static bool
94	isxdirect (uint32_t ch, enum variant var)
95	{
96	if (isdirect (ch, var))
97	return true;
98	if (var != UTF7)
99	return false;
100	return between (ch, `'!'`, `'&'`)
101	\|\| ch == `'*'`
102	\|\| between (ch, `';'`, `'@'`)
103	\|\| (between (ch, `'['`, '`') && ch != `'\\'`)
104	\|\| between (ch, `'{'`, `'}'`);
105	}
106
107
/* Characters which need to trigger an explicit shift back to US-ASCII
   (UTF-7 only): the modified base64 alphabet plus '-' (the shift back
   character):
   A-Z a-z 0-9 + / -

   Returns true iff CH is one of them.  */

static bool
needs_explicit_shift (uint32_t ch)
{
  return (between (ch, 'A', 'Z')
	  || between (ch, 'a', 'z')
	  /* '/' immediately precedes '0' in ASCII, so one range covers
	     the slash and all digits.  */
	  || between (ch, '/', '9') || ch == '+' || ch == '-');
}
120
121
122	/ Converts a value in the range 0..63 to a base64 encoded char. /
123	static unsigned char
124	base64 (unsigned int i, enum variant var)
125	{
126	if (i < `26`)
127	return i + `'A'`;
128	else if (i < `52`)
129	return i - `26` + `'a'`;
130	else if (i < `62`)
131	return i - `52` + `'0'`;
132	else if (i == `62`)
133	return `'+'`;
134	else if (i == `63` && var == UTF7)
135	return `'/'`;
136	else if (i == `63` && var == UTF_7_IMAP)
137	return `','`;
138	else
139	abort ();
140	}
141
142
/* Definitions used in the body of the `gconv' function.  */

/* This file provides its own gconv_init and gconv_end (presumably why
   the skeleton's generated versions are switched off here; confirm
   against iconv/skeleton.c).  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define FROM_LOOP		from_utf7_loop
#define TO_LOOP			to_utf7_loop
/* Byte counts per character: the FROM side is UTF-7, the TO side is
   UCS4 (always 4 bytes).  */
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define MAX_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_utf7)
/* Locals made available to the conversion code: a slot for
   SAVE_RESET_STATE, the conversion state pointer, and the direction and
   variant recorded by gconv_init in step->__data.  */
#define PREPARE_LOOP \
  mbstate_t saved_state; \
  mbstate_t *statep = data->__statep; \
  enum direction dir = ((struct utf7_data *) step->__data)->dir; \
  enum variant var = ((struct utf7_data *) step->__data)->var;
/* Extra arguments passed to the loop functions; they match
   EXTRA_LOOP_DECLS.  */
#define EXTRA_LOOP_ARGS		, statep, var
160
161
/* Which way a conversion step runs.  illegal_dir is the "not yet
   determined" value gconv_init starts from.  */
enum direction
{
  illegal_dir = 0,
  from_utf7 = 1,	/* UTF-7 is the source charset.  */
  to_utf7 = 2		/* UTF-7 is the destination charset.  */
};
168
/* Private per-step data.  gconv_init allocates one of these and stores
   it in step->__data; gconv_end frees it.  */
struct utf7_data
{
  enum direction dir;	/* Whether the step reads or writes UTF-7.  */
  enum variant var;	/* Which UTF-7 flavour the step handles.  */
};
174
/* Since we might have to reset input pointer we must be able to save
   and restore the state.  With a nonzero SAVE the current state is
   copied into saved_state; otherwise the state is restored from it.
   Relies on the saved_state and statep locals set up by PREPARE_LOOP.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    saved_state = *statep; \
  else \
    *statep = saved_state
182
183	int
184	gconv_init (struct __gconv_step *step)
185	{
186	/ Determine which direction. /
187	struct utf7_data *new_data;
188	enum direction dir = illegal_dir;
189
190	enum variant var = `0`;
191	for (const char name = names; name != `'\0'`;
192	name = __rawmemchr (name, `'\0'`) + `1`)
193	{
194	if (__strcasecmp (step->__from_name, name) == `0`)
195	{
196	dir = from_utf7;
197	break;
198	}
199	else if (__strcasecmp (step->__to_name, name) == `0`)
200	{
201	dir = to_utf7;
202	break;
203	}
204	++var;
205	}
206
207	if (__glibc_likely (dir != illegal_dir))
208	{
209	new_data = malloc (sizeof (*new_data));
210	if (new_data == NULL)
211	return __GCONV_NOMEM;
212
213	new_data->dir = dir;
214	new_data->var = var;
215	step->__data = new_data;
216
217	if (dir == from_utf7)
218	{
219	step->__min_needed_from = MIN_NEEDED_FROM;
220	step->__max_needed_from = MAX_NEEDED_FROM;
221	step->__min_needed_to = MIN_NEEDED_TO;
222	step->__max_needed_to = MAX_NEEDED_TO;
223	}
224	else
225	{
226	step->__min_needed_from = MIN_NEEDED_TO;
227	step->__max_needed_from = MAX_NEEDED_TO;
228	step->__min_needed_to = MIN_NEEDED_FROM;
229	step->__max_needed_to = MAX_NEEDED_FROM;
230	}
231	}
232	else
233	return __GCONV_NOCONV;
234
235	step->__stateful = `1`;
236
237	return __GCONV_OK;
238	}
239
240	void
241	gconv_end (struct __gconv_step *data)
242	{
243	free (data->__data);
244	}
245
246
247
248	/ First define the conversion function from UTF-7 to UCS4.*
249	The state is structured as follows:
250	__count bit 2..0: zero
251	__count bit 8..3: shift
252	__wch: data
253	Precise meaning:
254	shift data
255	0 -- not inside base64 encoding
256	1..32 XX..XX00..00 inside base64, (32 - shift) bits pending
257	This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.
258
259	When shift = 0, __wch needs to store at most one lookahead byte (see
260	__GCONV_INCOMPLETE_INPUT below).
261	*/
262	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
263	#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
264	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
265	#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
266	#define LOOPFCT FROM_LOOP
267	#define BODY \
268	{ \
269	uint_fast8_t ch = *inptr; \
270	\
271	if ((statep->__count >> 3) == 0) \
272	{ \
273	/* base64 encoding inactive. */ \
274	if (isxdirect (ch, var)) \
275	{ \
276	inptr++; \
277	put32 (outptr, ch); \
278	outptr += 4; \
279	} \
280	else if (__glibc_likely (ch == shift_character (var))) \
281	{ \
282	if (__glibc_unlikely (inptr + 2 > inend)) \
283	{ \
284	/* Not enough input available. */ \
285	result = __GCONV_INCOMPLETE_INPUT; \
286	break; \
287	} \
288	if (inptr[1] == '-') \
289	{ \
290	inptr += 2; \
291	put32 (outptr, ch); \
292	outptr += 4; \
293	} \
294	else \
295	{ \
296	/* Switch into base64 mode. */ \
297	inptr++; \
298	statep->__count = (32 << 3); \
299	statep->__value.__wch = 0; \
300	} \
301	} \
302	else \
303	{ \
304	/* The input is invalid. */ \
305	STANDARD_FROM_LOOP_ERR_HANDLER (1); \
306	} \
307	} \
308	else \
309	{ \
310	/* base64 encoding active. */ \
311	uint32_t i; \
312	int shift; \
313	\
314	if (ch >= 'A' && ch <= 'Z') \
315	i = ch - 'A'; \
316	else if (ch >= 'a' && ch <= 'z') \
317	i = ch - 'a' + 26; \
318	else if (ch >= '0' && ch <= '9') \
319	i = ch - '0' + 52; \
320	else if (ch == '+') \
321	i = 62; \
322	else if ((var == UTF7 && ch == '/') \
323	\|\| (var == UTF_7_IMAP && ch == ',')) \
324	i = 63; \
325	else \
326	{ \
327	/* Terminate base64 encoding. */ \
328	\
329	/* If accumulated data is nonzero, the input is invalid. */ \
330	/* Also, partial UTF-16 characters are invalid. */ \
331	/* In IMAP variant, must be terminated by '-'. */ \
332	if (__glibc_unlikely (statep->__value.__wch != 0) \
333	\|\| __glibc_unlikely ((statep->__count >> 3) <= 26) \
334	\|\| __glibc_unlikely (var == UTF_7_IMAP && ch != '-')) \
335	{ \
336	STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
337	} \
338	\
339	if (ch == '-') \
340	inptr++; \
341	\
342	statep->__count = 0; \
343	continue; \
344	} \
345	\
346	/* Concatenate the base64 integer i to the accumulator. */ \
347	shift = (statep->__count >> 3); \
348	if (shift > 6) \
349	{ \
350	uint32_t wch; \
351	\
352	shift -= 6; \
353	wch = statep->__value.__wch \| (i << shift); \
354	\
355	if (shift <= 16 && shift > 10) \
356	{ \
357	/* An UTF-16 character has just been completed. */ \
358	uint32_t wc1 = wch >> 16; \
359	\
360	/* UTF-16: When we see a High Surrogate, we must also decode \
361	the following Low Surrogate. */ \
362	if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) \
363	{ \
364	wch = wch << 16; \
365	shift += 16; \
366	put32 (outptr, wc1); \
367	outptr += 4; \
368	} \
369	} \
370	else if (shift <= 10 && shift > 4) \
371	{ \
372	/* After a High Surrogate, verify that the next 16 bit \
373	indeed form a Low Surrogate. */ \
374	uint32_t wc2 = wch & 0xffff; \
375	\
376	if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000)) \
377	{ \
378	STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
379	} \
380	} \
381	\
382	statep->__value.__wch = wch; \
383	} \
384	else \
385	{ \
386	/* An UTF-16 surrogate pair has just been completed. */ \
387	uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16; \
388	uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff) \
389	\| (i >> (6 - shift)); \
390	\
391	statep->__value.__wch = (i << shift) << 26; \
392	shift += 26; \
393	\
394	assert (wc1 >= 0xd800 && wc1 < 0xdc00); \
395	assert (wc2 >= 0xdc00 && wc2 < 0xe000); \
396	put32 (outptr, \
397	0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00)); \
398	outptr += 4; \
399	} \
400	\
401	statep->__count = shift << 3; \
402	\
403	/* Now that we digested the input increment the input pointer. */ \
404	inptr++; \
405	} \
406	}
407	#define LOOP_NEED_FLAGS
408	#define EXTRA_LOOP_DECLS , mbstate_t *statep, enum variant var
409	#include <iconv/loop.c>
410
411
412	/ Next, define the conversion from UCS4 to UTF-7.*
413	The state is structured as follows:
414	__count bit 2..0: zero
415	__count bit 4..3: shift
416	__count bit 8..5: data
417	Precise meaning:
418	shift data
419	0 0 not inside base64 encoding
420	1 0 inside base64, no pending bits
421	2 XX00 inside base64, 2 bits known for next byte
422	3 XXXX inside base64, 4 bits known for next byte
423
424	__count bit 2..0 and __wch are always zero, because this direction
425	never returns __GCONV_INCOMPLETE_INPUT.
426	*/
427	#define MIN_NEEDED_INPUT MIN_NEEDED_TO
428	#define MAX_NEEDED_INPUT MAX_NEEDED_TO
429	#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
430	#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
431	#define LOOPFCT TO_LOOP
432	#define BODY \
433	{ \
434	uint32_t ch = get32 (inptr); \
435	\
436	if ((statep->__count & 0x18) == 0) \
437	{ \
438	/* base64 encoding inactive */ \
439	if (isdirect (ch, var)) \
440	{ \
441	*outptr++ = (unsigned char) ch; \
442	} \
443	else \
444	{ \
445	size_t count; \
446	\
447	if (ch == shift_character (var)) \
448	count = 2; \
449	else if (ch < 0x10000) \
450	count = 3; \
451	else if (ch < 0x110000) \
452	count = 6; \
453	else \
454	STANDARD_TO_LOOP_ERR_HANDLER (4); \
455	\
456	if (__glibc_unlikely (outptr + count > outend)) \
457	{ \
458	result = __GCONV_FULL_OUTPUT; \
459	break; \
460	} \
461	\
462	*outptr++ = shift_character (var); \
463	if (ch == shift_character (var)) \
464	*outptr++ = '-'; \
465	else if (ch < 0x10000) \
466	{ \
467	*outptr++ = base64 (ch >> 10, var); \
468	*outptr++ = base64 ((ch >> 4) & 0x3f, var); \
469	statep->__count = ((ch & 15) << 5) \| (3 << 3); \
470	} \
471	else if (ch < 0x110000) \
472	{ \
473	uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
474	uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
475	\
476	ch = (ch1 << 16) \| ch2; \
477	*outptr++ = base64 (ch >> 26, var); \
478	*outptr++ = base64 ((ch >> 20) & 0x3f, var); \
479	*outptr++ = base64 ((ch >> 14) & 0x3f, var); \
480	*outptr++ = base64 ((ch >> 8) & 0x3f, var); \
481	*outptr++ = base64 ((ch >> 2) & 0x3f, var); \
482	statep->__count = ((ch & 3) << 7) \| (2 << 3); \
483	} \
484	else \
485	abort (); \
486	} \
487	} \
488	else \
489	{ \
490	/* base64 encoding active */ \
491	if ((var == UTF_7_IMAP && ch == '&') \|\| isdirect (ch, var)) \
492	{ \
493	/* deactivate base64 encoding */ \
494	size_t count; \
495	\
496	count = ((statep->__count & 0x18) >= 0x10) \
497	+ (var == UTF_7_IMAP \|\| needs_explicit_shift (ch)) \
498	+ (var == UTF_7_IMAP && ch == '&') \
499	+ 1; \
500	if (__glibc_unlikely (outptr + count > outend)) \
501	{ \
502	result = __GCONV_FULL_OUTPUT; \
503	break; \
504	} \
505	\
506	if ((statep->__count & 0x18) >= 0x10) \
507	*outptr++ = base64 ((statep->__count >> 3) & ~3, var); \
508	if (var == UTF_7_IMAP \|\| needs_explicit_shift (ch)) \
509	*outptr++ = '-'; \
510	*outptr++ = (unsigned char) ch; \
511	if (var == UTF_7_IMAP && ch == '&') \
512	*outptr++ = '-'; \
513	statep->__count = 0; \
514	} \
515	else \
516	{ \
517	size_t count; \
518	\
519	if (ch < 0x10000) \
520	count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2); \
521	else if (ch < 0x110000) \
522	count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5); \
523	else \
524	STANDARD_TO_LOOP_ERR_HANDLER (4); \
525	\
526	if (__glibc_unlikely (outptr + count > outend)) \
527	{ \
528	result = __GCONV_FULL_OUTPUT; \
529	break; \
530	} \
531	\
532	if (ch < 0x10000) \
533	{ \
534	switch ((statep->__count >> 3) & 3) \
535	{ \
536	case 1: \
537	*outptr++ = base64 (ch >> 10, var); \
538	*outptr++ = base64 ((ch >> 4) & 0x3f, var); \
539	statep->__count = ((ch & 15) << 5) \| (3 << 3); \
540	break; \
541	case 2: \
542	*outptr++ = \
543	base64 (((statep->__count >> 3) & ~3) \| (ch >> 12), \
544	var); \
545	*outptr++ = base64 ((ch >> 6) & 0x3f, var); \
546	*outptr++ = base64 (ch & 0x3f, var); \
547	statep->__count = (1 << 3); \
548	break; \
549	case 3: \
550	*outptr++ = \
551	base64 (((statep->__count >> 3) & ~3) \| (ch >> 14), \
552	var); \
553	*outptr++ = base64 ((ch >> 8) & 0x3f, var); \
554	*outptr++ = base64 ((ch >> 2) & 0x3f, var); \
555	statep->__count = ((ch & 3) << 7) \| (2 << 3); \
556	break; \
557	default: \
558	abort (); \
559	} \
560	} \
561	else if (ch < 0x110000) \
562	{ \
563	uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
564	uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
565	\
566	ch = (ch1 << 16) \| ch2; \
567	switch ((statep->__count >> 3) & 3) \
568	{ \
569	case 1: \
570	*outptr++ = base64 (ch >> 26, var); \
571	*outptr++ = base64 ((ch >> 20) & 0x3f, var); \
572	*outptr++ = base64 ((ch >> 14) & 0x3f, var); \
573	*outptr++ = base64 ((ch >> 8) & 0x3f, var); \
574	*outptr++ = base64 ((ch >> 2) & 0x3f, var); \
575	statep->__count = ((ch & 3) << 7) \| (2 << 3); \
576	break; \
577	case 2: \
578	*outptr++ = \
579	base64 (((statep->__count >> 3) & ~3) \| (ch >> 28), \
580	var); \
581	*outptr++ = base64 ((ch >> 22) & 0x3f, var); \
582	*outptr++ = base64 ((ch >> 16) & 0x3f, var); \
583	*outptr++ = base64 ((ch >> 10) & 0x3f, var); \
584	*outptr++ = base64 ((ch >> 4) & 0x3f, var); \
585	statep->__count = ((ch & 15) << 5) \| (3 << 3); \
586	break; \
587	case 3: \
588	*outptr++ = \
589	base64 (((statep->__count >> 3) & ~3) \| (ch >> 30), \
590	var); \
591	*outptr++ = base64 ((ch >> 24) & 0x3f, var); \
592	*outptr++ = base64 ((ch >> 18) & 0x3f, var); \
593	*outptr++ = base64 ((ch >> 12) & 0x3f, var); \
594	*outptr++ = base64 ((ch >> 6) & 0x3f, var); \
595	*outptr++ = base64 (ch & 0x3f, var); \
596	statep->__count = (1 << 3); \
597	break; \
598	default: \
599	abort (); \
600	} \
601	} \
602	else \
603	abort (); \
604	} \
605	} \
606	\
607	/* Now that we wrote the output increment the input pointer. */ \
608	inptr += 4; \
609	}
610	#define LOOP_NEED_FLAGS
611	#define EXTRA_LOOP_DECLS , mbstate_t *statep, enum variant var
612	#include <iconv/loop.c>
613
614
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   When decoding, the state is simply cleared.  When encoding with base64
   active, any pending bits are written as one final base64 byte,
   followed by '-'.  If the output buffer has no room for that, status is
   set to __GCONV_FULL_OUTPUT and the state is left as it was.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION) \
    /* Nothing to emit.  */ \
    memset (data->__statep, '\0', sizeof (mbstate_t)); \
  else \
    { \
      /* The "to UTF-7" direction.  Flush the remaining bits and terminate \
         with a '-' byte.  This will guarantee correct decoding if more \
         UTF-7 encoded text is added afterwards.  */ \
      int state = data->__statep->__count; \
      \
      if (state & 0x18) \
        { \
          /* Deactivate base64 encoding.  */ \
          size_t count = ((state & 0x18) >= 0x10) + 1; \
          \
          if (__glibc_unlikely (outbuf + count > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              if ((state & 0x18) >= 0x10) \
                *outbuf++ = base64 ((state >> 3) & ~3, var); \
              *outbuf++ = '-'; \
              \
              data->__statep->__count = 0; \
            } \
        } \
      else \
        data->__statep->__count = 0; \
    }
650
651
/* Now define the toplevel functions.  */
653	#include <iconv/skeleton.c>
654

Browse the source code of glibc/iconvdata/utf-7.c