gconv_simple.c source code [glibc/iconv/gconv_simple.c]

/* Simple transformations functions.
   Copyright (C) 1997-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <byteswap.h>
20	#include <dlfcn.h>
21	#include <endian.h>
22	#include <errno.h>
23	#include <gconv.h>
24	#include <stdint.h>
25	#include <stdlib.h>
26	#include <string.h>
27	#include <wchar.h>
28	#include <sys/param.h>
29	#include <gconv_int.h>
30
31	#define BUILTIN_ALIAS(s1, s2) /* nothing */
32	#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
33	MinF, MaxF, MinT, MaxT) \
34	extern int Fct (struct __gconv_step , struct __gconv_step_data , \
35	const unsigned char *, const unsigned char , \
36	unsigned char *, size_t , int, int);
37	#include "gconv_builtin.h"
38
39
40	#ifndef EILSEQ
41	# define EILSEQ EINVAL
42	#endif
43
44
/* Specialized conversion function for a single byte to INTERNAL,
   recognizing only ASCII characters.

   STEP is not used.  Returns C unchanged when it is below 0x80 and
   WEOF for every other byte value.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  if (c < 0x80)
    return c;
  else
    return WEOF;
}
55
56
57	/ Transform from the internal, UCS4-like format, to UCS4. The*
58	difference between the internal ucs4 format and the real UCS4
59	format is, if any, the endianness. The Unicode/ISO 10646 says that
60	unless some higher protocol specifies it differently, the byte
61	order is big endian./*
62	#define DEFINE_INIT 0
63	#define DEFINE_FINI 0
64	#define MIN_NEEDED_FROM 4
65	#define MIN_NEEDED_TO 4
66	#define FROM_DIRECTION 1
67	#define FROM_LOOP internal_ucs4_loop
68	#define TO_LOOP internal_ucs4_loop /* This is not used. */
69	#define FUNCTION_NAME __gconv_transform_internal_ucs4
70	#define ONE_DIRECTION 0
71
72
73	static inline int
74	__attribute ((always_inline))
75	internal_ucs4_loop (struct __gconv_step *step,
76	struct __gconv_step_data *step_data,
77	const unsigned char *inptrp, const* unsigned char *inend,
78	unsigned char *outptrp, const* unsigned char *outend,
79	size_t *irreversible)
80	{
81	const unsigned char inptr = inptrp;
82	unsigned char outptr = outptrp;
83	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
84	int result;
85
86	#if __BYTE_ORDER == __LITTLE_ENDIAN
87	/ Sigh, we have to do some real work. /
88	size_t cnt;
89
90	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`, outptr += `4`)
91	{
92	uint32_t val = get32 (inptr);
93	put32 (outptr, __builtin_bswap32 (val));
94	}
95
96	*inptrp = inptr;
97	*outptrp = outptr;
98	#elif __BYTE_ORDER == __BIG_ENDIAN
99	/ Simply copy the data. /
100	inptrp = inptr + n_convert `4`;
101	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
102	#else
103	# error "This endianness is not supported."
104	#endif
105
106	/ Determine the status. /
107	if (*inptrp == inend)
108	result = __GCONV_EMPTY_INPUT;
109	else if (*outptrp + `4` > outend)
110	result = __GCONV_FULL_OUTPUT;
111	else
112	result = __GCONV_INCOMPLETE_INPUT;
113
114	return result;
115	}
116
117
118	static inline int
119	__attribute ((always_inline))
120	internal_ucs4_loop_single (struct __gconv_step *step,
121	struct __gconv_step_data *step_data,
122	const unsigned char **inptrp,
123	const unsigned char *inend,
124	unsigned char **outptrp,
125	const unsigned char *outend,
126	size_t *irreversible)
127	{
128	mbstate_t *state = step_data->__statep;
129	size_t cnt = state->__count & `7`;
130
131	while (*inptrp < inend && cnt < `4`)
132	state->__value.__wchb[cnt++] = (inptrp)++;
133
134	if (__glibc_unlikely (cnt < `4`))
135	{
136	/ Still not enough bytes. Store the ones in the input buffer. /
137	state->__count &= ~`7`;
138	state->__count \|= cnt;
139
140	return __GCONV_INCOMPLETE_INPUT;
141	}
142
143	#if __BYTE_ORDER == __LITTLE_ENDIAN
144	(*outptrp)[`0`] = state->__value.__wchb[`3`];
145	(*outptrp)[`1`] = state->__value.__wchb[`2`];
146	(*outptrp)[`2`] = state->__value.__wchb[`1`];
147	(*outptrp)[`3`] = state->__value.__wchb[`0`];
148
149	#elif __BYTE_ORDER == __BIG_ENDIAN
150	/ XXX unaligned /
151	(*outptrp)[`0`] = state->__value.__wchb[`0`];
152	(*outptrp)[`1`] = state->__value.__wchb[`1`];
153	(*outptrp)[`2`] = state->__value.__wchb[`2`];
154	(*outptrp)[`3`] = state->__value.__wchb[`3`];
155	#else
156	# error "This endianness is not supported."
157	#endif
158	*outptrp += `4`;
159
160	/ Clear the state buffer. /
161	state->__count &= ~`7`;
162
163	return __GCONV_OK;
164	}
165
166	#include <iconv/skeleton.c>
167
168
169	/ Transform from UCS4 to the internal, UCS4-like format. Unlike*
170	for the other direction we have to check for correct values here. /*
171	#define DEFINE_INIT 0
172	#define DEFINE_FINI 0
173	#define MIN_NEEDED_FROM 4
174	#define MIN_NEEDED_TO 4
175	#define FROM_DIRECTION 1
176	#define FROM_LOOP ucs4_internal_loop
177	#define TO_LOOP ucs4_internal_loop /* This is not used. */
178	#define FUNCTION_NAME __gconv_transform_ucs4_internal
179	#define ONE_DIRECTION 0
180
181
182	static inline int
183	__attribute ((always_inline))
184	ucs4_internal_loop (struct __gconv_step *step,
185	struct __gconv_step_data *step_data,
186	const unsigned char *inptrp, const* unsigned char *inend,
187	unsigned char *outptrp, const* unsigned char *outend,
188	size_t *irreversible)
189	{
190	int flags = step_data->__flags;
191	const unsigned char inptr = inptrp;
192	unsigned char outptr = outptrp;
193	int result;
194
195	for (; inptr + `4` <= inend && outptr + `4` <= outend; inptr += `4`)
196	{
197	uint32_t inval = get32 (inptr);
198	#if __BYTE_ORDER == __LITTLE_ENDIAN
199	inval = __builtin_bswap32 (inval);
200	#endif
201
202	if (__glibc_unlikely (inval > `0x7fffffff`))
203	{
204	/ The value is too large. We don't try transliteration here since*
205	this is not an error because of the lack of possibilities to
206	represent the result. This is a genuine bug in the input since
207	UCS4 does not allow such values. /*
208	if (irreversible == NULL)
209	/ We are transliterating, don't try to correct anything. /
210	return __GCONV_ILLEGAL_INPUT;
211
212	if (flags & __GCONV_IGNORE_ERRORS)
213	{
214	/ Just ignore this character. /
215	++*irreversible;
216	continue;
217	}
218
219	*inptrp = inptr;
220	*outptrp = outptr;
221	return __GCONV_ILLEGAL_INPUT;
222	}
223
224	put32 (outptr, inval);
225	outptr += sizeof (uint32_t);
226	}
227
228	*inptrp = inptr;
229	*outptrp = outptr;
230
231	/ Determine the status. /
232	if (*inptrp == inend)
233	result = __GCONV_EMPTY_INPUT;
234	else if (*outptrp + `4` > outend)
235	result = __GCONV_FULL_OUTPUT;
236	else
237	result = __GCONV_INCOMPLETE_INPUT;
238
239	return result;
240	}
241
242
243	static inline int
244	__attribute ((always_inline))
245	ucs4_internal_loop_single (struct __gconv_step *step,
246	struct __gconv_step_data *step_data,
247	const unsigned char **inptrp,
248	const unsigned char *inend,
249	unsigned char **outptrp,
250	const unsigned char *outend,
251	size_t *irreversible)
252	{
253	mbstate_t *state = step_data->__statep;
254	int flags = step_data->__flags;
255	size_t cnt = state->__count & `7`;
256
257	while (*inptrp < inend && cnt < `4`)
258	state->__value.__wchb[cnt++] = (inptrp)++;
259
260	if (__glibc_unlikely (cnt < `4`))
261	{
262	/ Still not enough bytes. Store the ones in the input buffer. /
263	state->__count &= ~`7`;
264	state->__count \|= cnt;
265
266	return __GCONV_INCOMPLETE_INPUT;
267	}
268
269	if (__builtin_expect (((unsigned char *) state->__value.__wchb)[`0`] > `0x80`,
270	`0`))
271	{
272	/ The value is too large. We don't try transliteration here since*
273	this is not an error because of the lack of possibilities to
274	represent the result. This is a genuine bug in the input since
275	UCS4 does not allow such values. /*
276	if (!(flags & __GCONV_IGNORE_ERRORS))
277	{
278	*inptrp -= cnt - (state->__count & `7`);
279	return __GCONV_ILLEGAL_INPUT;
280	}
281	}
282	else
283	{
284	#if __BYTE_ORDER == __LITTLE_ENDIAN
285	(*outptrp)[`0`] = state->__value.__wchb[`3`];
286	(*outptrp)[`1`] = state->__value.__wchb[`2`];
287	(*outptrp)[`2`] = state->__value.__wchb[`1`];
288	(*outptrp)[`3`] = state->__value.__wchb[`0`];
289	#elif __BYTE_ORDER == __BIG_ENDIAN
290	(*outptrp)[`0`] = state->__value.__wchb[`0`];
291	(*outptrp)[`1`] = state->__value.__wchb[`1`];
292	(*outptrp)[`2`] = state->__value.__wchb[`2`];
293	(*outptrp)[`3`] = state->__value.__wchb[`3`];
294	#endif
295
296	*outptrp += `4`;
297	}
298
299	/ Clear the state buffer. /
300	state->__count &= ~`7`;
301
302	return __GCONV_OK;
303	}
304
305	#include <iconv/skeleton.c>
306
307
308	/ Similarly for the little endian form. /
309	#define DEFINE_INIT 0
310	#define DEFINE_FINI 0
311	#define MIN_NEEDED_FROM 4
312	#define MIN_NEEDED_TO 4
313	#define FROM_DIRECTION 1
314	#define FROM_LOOP internal_ucs4le_loop
315	#define TO_LOOP internal_ucs4le_loop /* This is not used. */
316	#define FUNCTION_NAME __gconv_transform_internal_ucs4le
317	#define ONE_DIRECTION 0
318
319
320	static inline int
321	__attribute ((always_inline))
322	internal_ucs4le_loop (struct __gconv_step *step,
323	struct __gconv_step_data *step_data,
324	const unsigned char *inptrp, const* unsigned char *inend,
325	unsigned char *outptrp, const* unsigned char *outend,
326	size_t *irreversible)
327	{
328	const unsigned char inptr = inptrp;
329	unsigned char outptr = outptrp;
330	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
331	int result;
332
333	#if __BYTE_ORDER == __BIG_ENDIAN
334	/ Sigh, we have to do some real work. /
335	size_t cnt;
336
337	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`, outptr += `4`)
338	{
339	uint32_t val = get32 (inptr);
340	put32 (outptr, __builtin_bswap32 (val));
341	}
342
343	*inptrp = inptr;
344	*outptrp = outptr;
345	#elif __BYTE_ORDER == __LITTLE_ENDIAN
346	/ Simply copy the data. /
347	inptrp = inptr + n_convert `4`;
348	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
349	#else
350	# error "This endianness is not supported."
351	#endif
352
353	/ Determine the status. /
354	if (*inptrp == inend)
355	result = __GCONV_EMPTY_INPUT;
356	else if (*outptrp + `4` > outend)
357	result = __GCONV_FULL_OUTPUT;
358	else
359	result = __GCONV_INCOMPLETE_INPUT;
360
361	return result;
362	}
363
364
365	static inline int
366	__attribute ((always_inline))
367	internal_ucs4le_loop_single (struct __gconv_step *step,
368	struct __gconv_step_data *step_data,
369	const unsigned char **inptrp,
370	const unsigned char *inend,
371	unsigned char **outptrp,
372	const unsigned char *outend,
373	size_t *irreversible)
374	{
375	mbstate_t *state = step_data->__statep;
376	size_t cnt = state->__count & `7`;
377
378	while (*inptrp < inend && cnt < `4`)
379	state->__value.__wchb[cnt++] = (inptrp)++;
380
381	if (__glibc_unlikely (cnt < `4`))
382	{
383	/ Still not enough bytes. Store the ones in the input buffer. /
384	state->__count &= ~`7`;
385	state->__count \|= cnt;
386
387	return __GCONV_INCOMPLETE_INPUT;
388	}
389
390	#if __BYTE_ORDER == __BIG_ENDIAN
391	(*outptrp)[`0`] = state->__value.__wchb[`3`];
392	(*outptrp)[`1`] = state->__value.__wchb[`2`];
393	(*outptrp)[`2`] = state->__value.__wchb[`1`];
394	(*outptrp)[`3`] = state->__value.__wchb[`0`];
395
396	#else
397	/ XXX unaligned /
398	(*outptrp)[`0`] = state->__value.__wchb[`0`];
399	(*outptrp)[`1`] = state->__value.__wchb[`1`];
400	(*outptrp)[`2`] = state->__value.__wchb[`2`];
401	(*outptrp)[`3`] = state->__value.__wchb[`3`];
402
403	#endif
404
405	*outptrp += `4`;
406
407	/ Clear the state buffer. /
408	state->__count &= ~`7`;
409
410	return __GCONV_OK;
411	}
412
413	#include <iconv/skeleton.c>
414
415
416	/ And finally from UCS4-LE to the internal encoding. /
417	#define DEFINE_INIT 0
418	#define DEFINE_FINI 0
419	#define MIN_NEEDED_FROM 4
420	#define MIN_NEEDED_TO 4
421	#define FROM_DIRECTION 1
422	#define FROM_LOOP ucs4le_internal_loop
423	#define TO_LOOP ucs4le_internal_loop /* This is not used. */
424	#define FUNCTION_NAME __gconv_transform_ucs4le_internal
425	#define ONE_DIRECTION 0
426
427
428	static inline int
429	__attribute ((always_inline))
430	ucs4le_internal_loop (struct __gconv_step *step,
431	struct __gconv_step_data *step_data,
432	const unsigned char *inptrp, const* unsigned char *inend,
433	unsigned char *outptrp, const* unsigned char *outend,
434	size_t *irreversible)
435	{
436	int flags = step_data->__flags;
437	const unsigned char inptr = inptrp;
438	unsigned char outptr = outptrp;
439	int result;
440
441	for (; inptr + `4` <= inend && outptr + `4` <= outend; inptr += `4`)
442	{
443	uint32_t inval = get32 (inptr);
444	#if __BYTE_ORDER == __BIG_ENDIAN
445	inval = __builtin_bswap32 (inval);
446	#endif
447
448	if (__glibc_unlikely (inval > `0x7fffffff`))
449	{
450	/ The value is too large. We don't try transliteration here since*
451	this is not an error because of the lack of possibilities to
452	represent the result. This is a genuine bug in the input since
453	UCS4 does not allow such values. /*
454	if (irreversible == NULL)
455	/ We are transliterating, don't try to correct anything. /
456	return __GCONV_ILLEGAL_INPUT;
457
458	if (flags & __GCONV_IGNORE_ERRORS)
459	{
460	/ Just ignore this character. /
461	++*irreversible;
462	continue;
463	}
464
465	*inptrp = inptr;
466	*outptrp = outptr;
467	return __GCONV_ILLEGAL_INPUT;
468	}
469
470	put32 (outptr, inval);
471	outptr += sizeof (uint32_t);
472	}
473
474	*inptrp = inptr;
475	*outptrp = outptr;
476
477	/ Determine the status. /
478	if (*inptrp == inend)
479	result = __GCONV_EMPTY_INPUT;
480	else if (*inptrp + `4` > inend)
481	result = __GCONV_INCOMPLETE_INPUT;
482	else
483	{
484	assert (*outptrp + `4` > outend);
485	result = __GCONV_FULL_OUTPUT;
486	}
487
488	return result;
489	}
490
491
492	static inline int
493	__attribute ((always_inline))
494	ucs4le_internal_loop_single (struct __gconv_step *step,
495	struct __gconv_step_data *step_data,
496	const unsigned char **inptrp,
497	const unsigned char *inend,
498	unsigned char **outptrp,
499	const unsigned char *outend,
500	size_t *irreversible)
501	{
502	mbstate_t *state = step_data->__statep;
503	int flags = step_data->__flags;
504	size_t cnt = state->__count & `7`;
505
506	while (*inptrp < inend && cnt < `4`)
507	state->__value.__wchb[cnt++] = (inptrp)++;
508
509	if (__glibc_unlikely (cnt < `4`))
510	{
511	/ Still not enough bytes. Store the ones in the input buffer. /
512	state->__count &= ~`7`;
513	state->__count \|= cnt;
514
515	return __GCONV_INCOMPLETE_INPUT;
516	}
517
518	if (__builtin_expect (((unsigned char *) state->__value.__wchb)[`3`] > `0x80`,
519	`0`))
520	{
521	/ The value is too large. We don't try transliteration here since*
522	this is not an error because of the lack of possibilities to
523	represent the result. This is a genuine bug in the input since
524	UCS4 does not allow such values. /*
525	if (!(flags & __GCONV_IGNORE_ERRORS))
526	return __GCONV_ILLEGAL_INPUT;
527	}
528	else
529	{
530	#if __BYTE_ORDER == __BIG_ENDIAN
531	(*outptrp)[`0`] = state->__value.__wchb[`3`];
532	(*outptrp)[`1`] = state->__value.__wchb[`2`];
533	(*outptrp)[`2`] = state->__value.__wchb[`1`];
534	(*outptrp)[`3`] = state->__value.__wchb[`0`];
535	#else
536	(*outptrp)[`0`] = state->__value.__wchb[`0`];
537	(*outptrp)[`1`] = state->__value.__wchb[`1`];
538	(*outptrp)[`2`] = state->__value.__wchb[`2`];
539	(*outptrp)[`3`] = state->__value.__wchb[`3`];
540	#endif
541
542	*outptrp += `4`;
543	}
544
545	/ Clear the state buffer. /
546	state->__count &= ~`7`;
547
548	return __GCONV_OK;
549	}
550
551	#include <iconv/skeleton.c>
552
553
554	/ Convert from ISO 646-IRV to the internal (UCS4-like) format. /
555	#define DEFINE_INIT 0
556	#define DEFINE_FINI 0
557	#define MIN_NEEDED_FROM 1
558	#define MIN_NEEDED_TO 4
559	#define FROM_DIRECTION 1
560	#define FROM_LOOP ascii_internal_loop
561	#define TO_LOOP ascii_internal_loop /* This is not used. */
562	#define FUNCTION_NAME __gconv_transform_ascii_internal
563	#define ONE_DIRECTION 1
564
565	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
566	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
567	#define LOOPFCT FROM_LOOP
568	#define BODY \
569	{ \
570	if (__glibc_unlikely (*inptr > '\x7f')) \
571	{ \
572	/* The value is too large. We don't try transliteration here since \
573	this is not an error because of the lack of possibilities to \
574	represent the result. This is a genuine bug in the input since \
575	ASCII does not allow such values. */ \
576	STANDARD_FROM_LOOP_ERR_HANDLER (1); \
577	} \
578	else \
579	{ \
580	/* It's an one byte sequence. */ \
581	((uint32_t ) outptr) = *inptr++; \
582	outptr += sizeof (uint32_t); \
583	} \
584	}
585	#define LOOP_NEED_FLAGS
586	#include <iconv/loop.c>
587	#include <iconv/skeleton.c>
588
589
590	/ Convert from the internal (UCS4-like) format to ISO 646-IRV. /
591	#define DEFINE_INIT 0
592	#define DEFINE_FINI 0
593	#define MIN_NEEDED_FROM 4
594	#define MIN_NEEDED_TO 1
595	#define FROM_DIRECTION 1
596	#define FROM_LOOP internal_ascii_loop
597	#define TO_LOOP internal_ascii_loop /* This is not used. */
598	#define FUNCTION_NAME __gconv_transform_internal_ascii
599	#define ONE_DIRECTION 1
600
601	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
602	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
603	#define LOOPFCT FROM_LOOP
604	#define BODY \
605	{ \
606	if (__glibc_unlikely (((const uint32_t ) inptr) > 0x7f)) \
607	{ \
608	UNICODE_TAG_HANDLER (((const uint32_t ) inptr), 4); \
609	STANDARD_TO_LOOP_ERR_HANDLER (4); \
610	} \
611	else \
612	{ \
613	/* It's an one byte sequence. */ \
614	outptr++ = ((const uint32_t *) inptr); \
615	inptr += sizeof (uint32_t); \
616	} \
617	}
618	#define LOOP_NEED_FLAGS
619	#include <iconv/loop.c>
620	#include <iconv/skeleton.c>
621
622
623	/ Convert from the internal (UCS4-like) format to UTF-8. /
624	#define DEFINE_INIT 0
625	#define DEFINE_FINI 0
626	#define MIN_NEEDED_FROM 4
627	#define MIN_NEEDED_TO 1
628	#define MAX_NEEDED_TO 6
629	#define FROM_DIRECTION 1
630	#define FROM_LOOP internal_utf8_loop
631	#define TO_LOOP internal_utf8_loop /* This is not used. */
632	#define FUNCTION_NAME __gconv_transform_internal_utf8
633	#define ONE_DIRECTION 1
634
635	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
636	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
637	#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
638	#define LOOPFCT FROM_LOOP
639	#define BODY \
640	{ \
641	uint32_t wc = ((const uint32_t ) inptr); \
642	\
643	if (__glibc_likely (wc < 0x80)) \
644	/* It's an one byte sequence. */ \
645	*outptr++ = (unsigned char) wc; \
646	else if (__glibc_likely (wc <= 0x7fffffff \
647	&& (wc < 0xd800 \|\| wc > 0xdfff))) \
648	{ \
649	size_t step; \
650	unsigned char *start; \
651	\
652	for (step = 2; step < 6; ++step) \
653	if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
654	break; \
655	\
656	if (__glibc_unlikely (outptr + step > outend)) \
657	{ \
658	/* Too long. */ \
659	result = __GCONV_FULL_OUTPUT; \
660	break; \
661	} \
662	\
663	start = outptr; \
664	*outptr = (unsigned char) (~0xff >> step); \
665	outptr += step; \
666	do \
667	{ \
668	start[--step] = 0x80 \| (wc & 0x3f); \
669	wc >>= 6; \
670	} \
671	while (step > 1); \
672	start[0] \|= wc; \
673	} \
674	else \
675	{ \
676	STANDARD_TO_LOOP_ERR_HANDLER (4); \
677	} \
678	\
679	inptr += 4; \
680	}
681	#define LOOP_NEED_FLAGS
682	#include <iconv/loop.c>
683	#include <iconv/skeleton.c>
684
685
/* Convert from UTF-8 to the internal (UCS4-like) format.  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define FROM_DIRECTION		1
#define FROM_LOOP		utf8_internal_loop
#define TO_LOOP			utf8_internal_loop /* This is not used.  */
#define FUNCTION_NAME		__gconv_transform_utf8_internal
#define ONE_DIRECTION		1

#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* Per-character body.  The lead byte selects the sequence length CNT
   (1 to 6 bytes).  Lead bytes 0x80..0xc1 and 0xfe/0xff are invalid and
   are skipped together with up to four following continuation bytes.
   If fewer than CNT bytes are available, the bytes present are checked
   and __GCONV_INCOMPLETE_INPUT is reported when they are all
   continuation bytes.  A complete sequence is rejected when a trail
   byte is not a continuation byte, when the value could have been
   encoded in fewer bytes, or when it is a UTF-16 surrogate.  All
   rejections go through the `errout' label with I holding the number
   of bytes to skip.  */
#define BODY \
  { \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
 \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        unsigned int cnt; \
        unsigned int i; \
 \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1, \
               otherwise the wide character could have been represented \
               using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
 \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
 \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report that check \
               that all the bytes are correct.  */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
 \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
 \
            goto errout; \
          } \
 \
        /* Read the possible remaining bytes.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
 \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
 \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
 \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates.  */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
 \
        inptr += cnt; \
      } \
 \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
813
/* Save an incomplete UTF-8 sequence in the conversion state.

   STATE->__count receives the number of input bytes available in its
   low bits and the total length CNT of the sequence shifted left by 8.
   STATE->__value.__wch receives the payload bits decoded so far,
   shifted left as if the missing continuation bytes were all zero.
   *INPTRP is advanced to INEND.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format.  We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer.  */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
 \
    state->__count = inend - *inptrp; \
 \
    assert (ch != 0xc0 && ch != 0xc1); \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* We expect two bytes.  The first byte cannot be 0xc0 or \
           0xc1, otherwise the wide character could have been \
           represented using a single byte.  */ \
        cnt = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* We expect three bytes.  */ \
        cnt = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* We expect four bytes.  */ \
        cnt = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* We expect five bytes.  */ \
        cnt = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* We expect six bytes.  */ \
        cnt = 6; \
        ch &= 0x01; \
      } \
 \
    /* The first byte is already consumed.  */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch <<= 6; \
        ch |= **inptrp & 0x3f; \
        --r; \
      } \
 \
    /* Shift for the so far missing bytes.  */ \
    ch <<= r * 6; \
 \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= cnt << 8; \
 \
    /* Store the value.  */ \
    state->__value.__wch = ch; \
  }
877
/* Rebuild in BYTEBUF the UTF-8 bytes of a partial sequence kept in the
   conversion state.

   The total sequence length is read from STATE->__count >> 8 and the
   number of bytes that were present, INLEN, from its low 8 bits.  The
   lead byte starts from the length marker in INMASK; continuation
   bytes below index INLEN are regenerated from six-bit groups of
   STATE->__value.__wch, and the bits left over are merged into the
   lead byte.  */
#define UNPACK_BYTES \
  { \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
 \
    inlen = state->__count & 255; \
 \
    bytebuf[0] = inmask[ntotal - 2]; \
 \
    do \
      { \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
 \
    bytebuf[0] |= wch; \
  }
898
899	#define CLEAR_STATE \
900	state->__count = 0
901
902
903	#include <iconv/loop.c>
904	#include <iconv/skeleton.c>
905
906
907	/ Convert from UCS2 to the internal (UCS4-like) format. /
908	#define DEFINE_INIT 0
909	#define DEFINE_FINI 0
910	#define MIN_NEEDED_FROM 2
911	#define MIN_NEEDED_TO 4
912	#define FROM_DIRECTION 1
913	#define FROM_LOOP ucs2_internal_loop
914	#define TO_LOOP ucs2_internal_loop /* This is not used. */
915	#define FUNCTION_NAME __gconv_transform_ucs2_internal
916	#define ONE_DIRECTION 1
917
918	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
919	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
920	#define LOOPFCT FROM_LOOP
921	#define BODY \
922	{ \
923	uint16_t u1 = get16 (inptr); \
924	\
925	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
926	{ \
927	/* Surrogate characters in UCS-2 input are not valid. Reject \
928	them. (Catching this here is not security relevant.) */ \
929	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
930	} \
931	\
932	((uint32_t ) outptr) = u1; \
933	outptr += sizeof (uint32_t); \
934	inptr += 2; \
935	}
936	#define LOOP_NEED_FLAGS
937	#include <iconv/loop.c>
938	#include <iconv/skeleton.c>
939
940
941	/ Convert from the internal (UCS4-like) format to UCS2. /
942	#define DEFINE_INIT 0
943	#define DEFINE_FINI 0
944	#define MIN_NEEDED_FROM 4
945	#define MIN_NEEDED_TO 2
946	#define FROM_DIRECTION 1
947	#define FROM_LOOP internal_ucs2_loop
948	#define TO_LOOP internal_ucs2_loop /* This is not used. */
949	#define FUNCTION_NAME __gconv_transform_internal_ucs2
950	#define ONE_DIRECTION 1
951
952	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
953	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
954	#define LOOPFCT FROM_LOOP
955	#define BODY \
956	{ \
957	uint32_t val = ((const uint32_t ) inptr); \
958	\
959	if (__glibc_unlikely (val >= 0x10000)) \
960	{ \
961	UNICODE_TAG_HANDLER (val, 4); \
962	STANDARD_TO_LOOP_ERR_HANDLER (4); \
963	} \
964	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
965	{ \
966	/* Surrogate characters in UCS-4 input are not valid. \
967	We must catch this, because the UCS-2 output might be \
968	interpreted as UTF-16 by other programs. If we let \
969	surrogates pass through, attackers could make a security \
970	hole exploit by synthesizing any desired plane 1-16 \
971	character. */ \
972	result = __GCONV_ILLEGAL_INPUT; \
973	if (! ignore_errors_p ()) \
974	break; \
975	inptr += 4; \
976	++*irreversible; \
977	continue; \
978	} \
979	else \
980	{ \
981	put16 (outptr, val); \
982	outptr += sizeof (uint16_t); \
983	inptr += 4; \
984	} \
985	}
986	#define LOOP_NEED_FLAGS
987	#include <iconv/loop.c>
988	#include <iconv/skeleton.c>
989
990
991	/ Convert from UCS2 in other endianness to the internal (UCS4-like) format. /
992	#define DEFINE_INIT 0
993	#define DEFINE_FINI 0
994	#define MIN_NEEDED_FROM 2
995	#define MIN_NEEDED_TO 4
996	#define FROM_DIRECTION 1
997	#define FROM_LOOP ucs2reverse_internal_loop
998	#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
999	#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
1000	#define ONE_DIRECTION 1
1001
1002	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1003	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1004	#define LOOPFCT FROM_LOOP
1005	#define BODY \
1006	{ \
1007	uint16_t u1 = bswap_16 (get16 (inptr)); \
1008	\
1009	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1010	{ \
1011	/* Surrogate characters in UCS-2 input are not valid. Reject \
1012	them. (Catching this here is not security relevant.) */ \
1013	if (! ignore_errors_p ()) \
1014	{ \
1015	result = __GCONV_ILLEGAL_INPUT; \
1016	break; \
1017	} \
1018	inptr += 2; \
1019	++*irreversible; \
1020	continue; \
1021	} \
1022	\
1023	((uint32_t ) outptr) = u1; \
1024	outptr += sizeof (uint32_t); \
1025	inptr += 2; \
1026	}
1027	#define LOOP_NEED_FLAGS
1028	#include <iconv/loop.c>
1029	#include <iconv/skeleton.c>
1030
1031
1032	/ Convert from the internal (UCS4-like) format to UCS2 in other endianness. /
1033	#define DEFINE_INIT 0
1034	#define DEFINE_FINI 0
1035	#define MIN_NEEDED_FROM 4
1036	#define MIN_NEEDED_TO 2
1037	#define FROM_DIRECTION 1
1038	#define FROM_LOOP internal_ucs2reverse_loop
1039	#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
1040	#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
1041	#define ONE_DIRECTION 1
1042
1043	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1044	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1045	#define LOOPFCT FROM_LOOP
1046	#define BODY \
1047	{ \
1048	uint32_t val = ((const uint32_t ) inptr); \
1049	if (__glibc_unlikely (val >= 0x10000)) \
1050	{ \
1051	UNICODE_TAG_HANDLER (val, 4); \
1052	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1053	} \
1054	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1055	{ \
1056	/* Surrogate characters in UCS-4 input are not valid. \
1057	We must catch this, because the UCS-2 output might be \
1058	interpreted as UTF-16 by other programs. If we let \
1059	surrogates pass through, attackers could make a security \
1060	hole exploit by synthesizing any desired plane 1-16 \
1061	character. */ \
1062	if (! ignore_errors_p ()) \
1063	{ \
1064	result = __GCONV_ILLEGAL_INPUT; \
1065	break; \
1066	} \
1067	inptr += 4; \
1068	++*irreversible; \
1069	continue; \
1070	} \
1071	else \
1072	{ \
1073	put16 (outptr, bswap_16 (val)); \
1074	outptr += sizeof (uint16_t); \
1075	inptr += 4; \
1076	} \
1077	}
1078	#define LOOP_NEED_FLAGS
1079	#include <iconv/loop.c>
1080	#include <iconv/skeleton.c>
1081

Browse the source code of glibc/iconv/gconv_simple.c