gconv_simple.c source code [glibc/iconv/gconv_simple.c]

/* Simple transformations functions.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
19
20	#include <byteswap.h>
21	#include <dlfcn.h>
22	#include <endian.h>
23	#include <errno.h>
24	#include <gconv.h>
25	#include <stdint.h>
26	#include <stdlib.h>
27	#include <string.h>
28	#include <wchar.h>
29	#include <sys/param.h>
30	#include <gconv_int.h>
31
32	#define BUILTIN_ALIAS(s1, s2) /* nothing */
33	#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
34	MinF, MaxF, MinT, MaxT) \
35	extern int Fct (struct __gconv_step , struct __gconv_step_data , \
36	const unsigned char *, const unsigned char , \
37	unsigned char *, size_t , int, int);
38	#include "gconv_builtin.h"
39
40
41	#ifndef EILSEQ
42	# define EILSEQ EINVAL
43	#endif
44
45
/* Specialized conversion function for a single byte to INTERNAL, recognizing
   only ASCII characters.

   STEP is not used.  Returns C itself when it is below 0x80 and WEOF
   for every other byte value.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  if (c < 0x80)
    return c;
  else
    return WEOF;
}
56
57
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal ucs4 format and the real UCS4
   format is, if any, the endianess.  The Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs4_loop
#define TO_LOOP                 internal_ucs4_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs4
#define ONE_DIRECTION           0
72
73
74	static inline int
75	__attribute ((always_inline))
76	internal_ucs4_loop (struct __gconv_step *step,
77	struct __gconv_step_data *step_data,
78	const unsigned char *inptrp, const* unsigned char *inend,
79	unsigned char *outptrp, unsigned* char *outend,
80	size_t *irreversible)
81	{
82	const unsigned char inptr = inptrp;
83	unsigned char outptr = outptrp;
84	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
85	int result;
86
87	#if __BYTE_ORDER == __LITTLE_ENDIAN
88	/ Sigh, we have to do some real work. /
89	size_t cnt;
90	uint32_t outptr32 = (uint32_t ) outptr;
91
92	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
93	outptr32++ = bswap_32 ((const uint32_t *) inptr);
94
95	*inptrp = inptr;
96	outptrp = (unsigned* char *) outptr32;
97	#elif __BYTE_ORDER == __BIG_ENDIAN
98	/ Simply copy the data. /
99	inptrp = inptr + n_convert `4`;
100	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
101	#else
102	# error "This endianess is not supported."
103	#endif
104
105	/ Determine the status. /
106	if (*inptrp == inend)
107	result = __GCONV_EMPTY_INPUT;
108	else if (*outptrp + `4` > outend)
109	result = __GCONV_FULL_OUTPUT;
110	else
111	result = __GCONV_INCOMPLETE_INPUT;
112
113	return result;
114	}
115
116	#if !_STRING_ARCH_unaligned
117	static inline int
118	__attribute ((always_inline))
119	internal_ucs4_loop_unaligned (struct __gconv_step *step,
120	struct __gconv_step_data *step_data,
121	const unsigned char **inptrp,
122	const unsigned char *inend,
123	unsigned char *outptrp, unsigned* char *outend,
124	size_t *irreversible)
125	{
126	const unsigned char inptr = inptrp;
127	unsigned char outptr = outptrp;
128	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
129	int result;
130
131	# if __BYTE_ORDER == __LITTLE_ENDIAN
132	/ Sigh, we have to do some real work. /
133	size_t cnt;
134
135	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`, outptr += `4`)
136	{
137	outptr[`0`] = inptr[`3`];
138	outptr[`1`] = inptr[`2`];
139	outptr[`2`] = inptr[`1`];
140	outptr[`3`] = inptr[`0`];
141	}
142
143	*inptrp = inptr;
144	*outptrp = outptr;
145	# elif __BYTE_ORDER == __BIG_ENDIAN
146	/ Simply copy the data. /
147	inptrp = inptr + n_convert `4`;
148	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
149	# else
150	# error "This endianess is not supported."
151	# endif
152
153	/ Determine the status. /
154	if (*inptrp == inend)
155	result = __GCONV_EMPTY_INPUT;
156	else if (*outptrp + `4` > outend)
157	result = __GCONV_FULL_OUTPUT;
158	else
159	result = __GCONV_INCOMPLETE_INPUT;
160
161	return result;
162	}
163	#endif
164
165
166	static inline int
167	__attribute ((always_inline))
168	internal_ucs4_loop_single (struct __gconv_step *step,
169	struct __gconv_step_data *step_data,
170	const unsigned char **inptrp,
171	const unsigned char *inend,
172	unsigned char *outptrp, unsigned* char *outend,
173	size_t *irreversible)
174	{
175	mbstate_t *state = step_data->__statep;
176	size_t cnt = state->__count & `7`;
177
178	while (*inptrp < inend && cnt < `4`)
179	state->__value.__wchb[cnt++] = (inptrp)++;
180
181	if (__glibc_unlikely (cnt < `4`))
182	{
183	/ Still not enough bytes. Store the ones in the input buffer. /
184	state->__count &= ~`7`;
185	state->__count \|= cnt;
186
187	return __GCONV_INCOMPLETE_INPUT;
188	}
189
190	#if __BYTE_ORDER == __LITTLE_ENDIAN
191	(*outptrp)[`0`] = state->__value.__wchb[`3`];
192	(*outptrp)[`1`] = state->__value.__wchb[`2`];
193	(*outptrp)[`2`] = state->__value.__wchb[`1`];
194	(*outptrp)[`3`] = state->__value.__wchb[`0`];
195
196	#elif __BYTE_ORDER == __BIG_ENDIAN
197	/ XXX unaligned /
198	(*outptrp)[`0`] = state->__value.__wchb[`0`];
199	(*outptrp)[`1`] = state->__value.__wchb[`1`];
200	(*outptrp)[`2`] = state->__value.__wchb[`2`];
201	(*outptrp)[`3`] = state->__value.__wchb[`3`];
202	#else
203	# error "This endianess is not supported."
204	#endif
205	*outptrp += `4`;
206
207	/ Clear the state buffer. /
208	state->__count &= ~`7`;
209
210	return __GCONV_OK;
211	}
212
213	#include <iconv/skeleton.c>
214
215
/* Transform from UCS4 to the internal, UCS4-like format.  Unlike
   for the other direction we have to check for correct values here.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4_internal_loop
#define TO_LOOP                 ucs4_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4_internal
#define ONE_DIRECTION           0
227
228
229	static inline int
230	__attribute ((always_inline))
231	ucs4_internal_loop (struct __gconv_step *step,
232	struct __gconv_step_data *step_data,
233	const unsigned char *inptrp, const* unsigned char *inend,
234	unsigned char *outptrp, unsigned* char *outend,
235	size_t *irreversible)
236	{
237	int flags = step_data->__flags;
238	const unsigned char inptr = inptrp;
239	unsigned char outptr = outptrp;
240	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
241	int result;
242	size_t cnt;
243
244	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
245	{
246	uint32_t inval;
247
248	#if __BYTE_ORDER == __LITTLE_ENDIAN
249	inval = bswap_32 ((const* uint32_t *) inptr);
250	#else
251	inval = (const* uint32_t *) inptr;
252	#endif
253
254	if (__glibc_unlikely (inval > `0x7fffffff`))
255	{
256	/ The value is too large. We don't try transliteration here since*
257	this is not an error because of the lack of possibilities to
258	represent the result. This is a genuine bug in the input since
259	UCS4 does not allow such values. /*
260	if (irreversible == NULL)
261	/ We are transliterating, don't try to correct anything. /
262	return __GCONV_ILLEGAL_INPUT;
263
264	if (flags & __GCONV_IGNORE_ERRORS)
265	{
266	/ Just ignore this character. /
267	++*irreversible;
268	continue;
269	}
270
271	*inptrp = inptr;
272	*outptrp = outptr;
273	return __GCONV_ILLEGAL_INPUT;
274	}
275
276	((uint32_t ) outptr) = inval;
277	outptr += sizeof (uint32_t);
278	}
279
280	*inptrp = inptr;
281	*outptrp = outptr;
282
283	/ Determine the status. /
284	if (*inptrp == inend)
285	result = __GCONV_EMPTY_INPUT;
286	else if (*outptrp + `4` > outend)
287	result = __GCONV_FULL_OUTPUT;
288	else
289	result = __GCONV_INCOMPLETE_INPUT;
290
291	return result;
292	}
293
294	#if !_STRING_ARCH_unaligned
295	static inline int
296	__attribute ((always_inline))
297	ucs4_internal_loop_unaligned (struct __gconv_step *step,
298	struct __gconv_step_data *step_data,
299	const unsigned char **inptrp,
300	const unsigned char *inend,
301	unsigned char *outptrp, unsigned* char *outend,
302	size_t *irreversible)
303	{
304	int flags = step_data->__flags;
305	const unsigned char inptr = inptrp;
306	unsigned char outptr = outptrp;
307	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
308	int result;
309	size_t cnt;
310
311	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
312	{
313	if (__glibc_unlikely (inptr[`0`] > `0x80`))
314	{
315	/ The value is too large. We don't try transliteration here since*
316	this is not an error because of the lack of possibilities to
317	represent the result. This is a genuine bug in the input since
318	UCS4 does not allow such values. /*
319	if (irreversible == NULL)
320	/ We are transliterating, don't try to correct anything. /
321	return __GCONV_ILLEGAL_INPUT;
322
323	if (flags & __GCONV_IGNORE_ERRORS)
324	{
325	/ Just ignore this character. /
326	++*irreversible;
327	continue;
328	}
329
330	*inptrp = inptr;
331	*outptrp = outptr;
332	return __GCONV_ILLEGAL_INPUT;
333	}
334
335	# if __BYTE_ORDER == __LITTLE_ENDIAN
336	outptr[`3`] = inptr[`0`];
337	outptr[`2`] = inptr[`1`];
338	outptr[`1`] = inptr[`2`];
339	outptr[`0`] = inptr[`3`];
340	# else
341	outptr[`0`] = inptr[`0`];
342	outptr[`1`] = inptr[`1`];
343	outptr[`2`] = inptr[`2`];
344	outptr[`3`] = inptr[`3`];
345	# endif
346	outptr += `4`;
347	}
348
349	*inptrp = inptr;
350	*outptrp = outptr;
351
352	/ Determine the status. /
353	if (*inptrp == inend)
354	result = __GCONV_EMPTY_INPUT;
355	else if (*outptrp + `4` > outend)
356	result = __GCONV_FULL_OUTPUT;
357	else
358	result = __GCONV_INCOMPLETE_INPUT;
359
360	return result;
361	}
362	#endif
363
364
365	static inline int
366	__attribute ((always_inline))
367	ucs4_internal_loop_single (struct __gconv_step *step,
368	struct __gconv_step_data *step_data,
369	const unsigned char **inptrp,
370	const unsigned char *inend,
371	unsigned char *outptrp, unsigned* char *outend,
372	size_t *irreversible)
373	{
374	mbstate_t *state = step_data->__statep;
375	int flags = step_data->__flags;
376	size_t cnt = state->__count & `7`;
377
378	while (*inptrp < inend && cnt < `4`)
379	state->__value.__wchb[cnt++] = (inptrp)++;
380
381	if (__glibc_unlikely (cnt < `4`))
382	{
383	/ Still not enough bytes. Store the ones in the input buffer. /
384	state->__count &= ~`7`;
385	state->__count \|= cnt;
386
387	return __GCONV_INCOMPLETE_INPUT;
388	}
389
390	if (__builtin_expect (((unsigned char *) state->__value.__wchb)[`0`] > `0x80`,
391	`0`))
392	{
393	/ The value is too large. We don't try transliteration here since*
394	this is not an error because of the lack of possibilities to
395	represent the result. This is a genuine bug in the input since
396	UCS4 does not allow such values. /*
397	if (!(flags & __GCONV_IGNORE_ERRORS))
398	{
399	*inptrp -= cnt - (state->__count & `7`);
400	return __GCONV_ILLEGAL_INPUT;
401	}
402	}
403	else
404	{
405	#if __BYTE_ORDER == __LITTLE_ENDIAN
406	(*outptrp)[`0`] = state->__value.__wchb[`3`];
407	(*outptrp)[`1`] = state->__value.__wchb[`2`];
408	(*outptrp)[`2`] = state->__value.__wchb[`1`];
409	(*outptrp)[`3`] = state->__value.__wchb[`0`];
410	#elif __BYTE_ORDER == __BIG_ENDIAN
411	(*outptrp)[`0`] = state->__value.__wchb[`0`];
412	(*outptrp)[`1`] = state->__value.__wchb[`1`];
413	(*outptrp)[`2`] = state->__value.__wchb[`2`];
414	(*outptrp)[`3`] = state->__value.__wchb[`3`];
415	#endif
416
417	*outptrp += `4`;
418	}
419
420	/ Clear the state buffer. /
421	state->__count &= ~`7`;
422
423	return __GCONV_OK;
424	}
425
426	#include <iconv/skeleton.c>
427
428
/* Similarly for the little endian form.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs4le_loop
#define TO_LOOP                 internal_ucs4le_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs4le
#define ONE_DIRECTION           0
439
440
441	static inline int
442	__attribute ((always_inline))
443	internal_ucs4le_loop (struct __gconv_step *step,
444	struct __gconv_step_data *step_data,
445	const unsigned char *inptrp, const* unsigned char *inend,
446	unsigned char *outptrp, unsigned* char *outend,
447	size_t *irreversible)
448	{
449	const unsigned char inptr = inptrp;
450	unsigned char outptr = outptrp;
451	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
452	int result;
453
454	#if __BYTE_ORDER == __BIG_ENDIAN
455	/ Sigh, we have to do some real work. /
456	size_t cnt;
457	uint32_t outptr32 = (uint32_t ) outptr;
458
459	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
460	outptr32++ = bswap_32 ((const uint32_t *) inptr);
461	outptr = (unsigned char *) outptr32;
462
463	*inptrp = inptr;
464	*outptrp = outptr;
465	#elif __BYTE_ORDER == __LITTLE_ENDIAN
466	/ Simply copy the data. /
467	inptrp = inptr + n_convert `4`;
468	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
469	#else
470	# error "This endianess is not supported."
471	#endif
472
473	/ Determine the status. /
474	if (*inptrp == inend)
475	result = __GCONV_EMPTY_INPUT;
476	else if (*outptrp + `4` > outend)
477	result = __GCONV_FULL_OUTPUT;
478	else
479	result = __GCONV_INCOMPLETE_INPUT;
480
481	return result;
482	}
483
484	#if !_STRING_ARCH_unaligned
485	static inline int
486	__attribute ((always_inline))
487	internal_ucs4le_loop_unaligned (struct __gconv_step *step,
488	struct __gconv_step_data *step_data,
489	const unsigned char **inptrp,
490	const unsigned char *inend,
491	unsigned char *outptrp, unsigned* char *outend,
492	size_t *irreversible)
493	{
494	const unsigned char inptr = inptrp;
495	unsigned char outptr = outptrp;
496	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
497	int result;
498
499	# if __BYTE_ORDER == __BIG_ENDIAN
500	/ Sigh, we have to do some real work. /
501	size_t cnt;
502
503	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`, outptr += `4`)
504	{
505	outptr[`0`] = inptr[`3`];
506	outptr[`1`] = inptr[`2`];
507	outptr[`2`] = inptr[`1`];
508	outptr[`3`] = inptr[`0`];
509	}
510
511	*inptrp = inptr;
512	*outptrp = outptr;
513	# elif __BYTE_ORDER == __LITTLE_ENDIAN
514	/ Simply copy the data. /
515	inptrp = inptr + n_convert `4`;
516	outptrp = __mempcpy (outptr, inptr, n_convert `4`);
517	# else
518	# error "This endianess is not supported."
519	# endif
520
521	/ Determine the status. /
522	if (*inptrp == inend)
523	result = __GCONV_EMPTY_INPUT;
524	else if (*inptrp + `4` > inend)
525	result = __GCONV_INCOMPLETE_INPUT;
526	else
527	{
528	assert (*outptrp + `4` > outend);
529	result = __GCONV_FULL_OUTPUT;
530	}
531
532	return result;
533	}
534	#endif
535
536
537	static inline int
538	__attribute ((always_inline))
539	internal_ucs4le_loop_single (struct __gconv_step *step,
540	struct __gconv_step_data *step_data,
541	const unsigned char **inptrp,
542	const unsigned char *inend,
543	unsigned char *outptrp, unsigned* char *outend,
544	size_t *irreversible)
545	{
546	mbstate_t *state = step_data->__statep;
547	size_t cnt = state->__count & `7`;
548
549	while (*inptrp < inend && cnt < `4`)
550	state->__value.__wchb[cnt++] = (inptrp)++;
551
552	if (__glibc_unlikely (cnt < `4`))
553	{
554	/ Still not enough bytes. Store the ones in the input buffer. /
555	state->__count &= ~`7`;
556	state->__count \|= cnt;
557
558	return __GCONV_INCOMPLETE_INPUT;
559	}
560
561	#if __BYTE_ORDER == __BIG_ENDIAN
562	(*outptrp)[`0`] = state->__value.__wchb[`3`];
563	(*outptrp)[`1`] = state->__value.__wchb[`2`];
564	(*outptrp)[`2`] = state->__value.__wchb[`1`];
565	(*outptrp)[`3`] = state->__value.__wchb[`0`];
566
567	#else
568	/ XXX unaligned /
569	(*outptrp)[`0`] = state->__value.__wchb[`0`];
570	(*outptrp)[`1`] = state->__value.__wchb[`1`];
571	(*outptrp)[`2`] = state->__value.__wchb[`2`];
572	(*outptrp)[`3`] = state->__value.__wchb[`3`];
573
574	#endif
575
576	*outptrp += `4`;
577
578	/ Clear the state buffer. /
579	state->__count &= ~`7`;
580
581	return __GCONV_OK;
582	}
583
584	#include <iconv/skeleton.c>
585
586
/* And finally from UCS4-LE to the internal encoding.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4le_internal_loop
#define TO_LOOP                 ucs4le_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4le_internal
#define ONE_DIRECTION           0
597
598
599	static inline int
600	__attribute ((always_inline))
601	ucs4le_internal_loop (struct __gconv_step *step,
602	struct __gconv_step_data *step_data,
603	const unsigned char *inptrp, const* unsigned char *inend,
604	unsigned char *outptrp, unsigned* char *outend,
605	size_t *irreversible)
606	{
607	int flags = step_data->__flags;
608	const unsigned char inptr = inptrp;
609	unsigned char outptr = outptrp;
610	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
611	int result;
612	size_t cnt;
613
614	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
615	{
616	uint32_t inval;
617
618	#if __BYTE_ORDER == __BIG_ENDIAN
619	inval = bswap_32 ((const* uint32_t *) inptr);
620	#else
621	inval = (const* uint32_t *) inptr;
622	#endif
623
624	if (__glibc_unlikely (inval > `0x7fffffff`))
625	{
626	/ The value is too large. We don't try transliteration here since*
627	this is not an error because of the lack of possibilities to
628	represent the result. This is a genuine bug in the input since
629	UCS4 does not allow such values. /*
630	if (irreversible == NULL)
631	/ We are transliterating, don't try to correct anything. /
632	return __GCONV_ILLEGAL_INPUT;
633
634	if (flags & __GCONV_IGNORE_ERRORS)
635	{
636	/ Just ignore this character. /
637	++*irreversible;
638	continue;
639	}
640
641	*inptrp = inptr;
642	*outptrp = outptr;
643	return __GCONV_ILLEGAL_INPUT;
644	}
645
646	((uint32_t ) outptr) = inval;
647	outptr += sizeof (uint32_t);
648	}
649
650	*inptrp = inptr;
651	*outptrp = outptr;
652
653	/ Determine the status. /
654	if (*inptrp == inend)
655	result = __GCONV_EMPTY_INPUT;
656	else if (*inptrp + `4` > inend)
657	result = __GCONV_INCOMPLETE_INPUT;
658	else
659	{
660	assert (*outptrp + `4` > outend);
661	result = __GCONV_FULL_OUTPUT;
662	}
663
664	return result;
665	}
666
667	#if !_STRING_ARCH_unaligned
668	static inline int
669	__attribute ((always_inline))
670	ucs4le_internal_loop_unaligned (struct __gconv_step *step,
671	struct __gconv_step_data *step_data,
672	const unsigned char **inptrp,
673	const unsigned char *inend,
674	unsigned char *outptrp, unsigned* char *outend,
675	size_t *irreversible)
676	{
677	int flags = step_data->__flags;
678	const unsigned char inptr = inptrp;
679	unsigned char outptr = outptrp;
680	size_t n_convert = MIN (inend - inptr, outend - outptr) / `4`;
681	int result;
682	size_t cnt;
683
684	for (cnt = `0`; cnt < n_convert; ++cnt, inptr += `4`)
685	{
686	if (__glibc_unlikely (inptr[`3`] > `0x80`))
687	{
688	/ The value is too large. We don't try transliteration here since*
689	this is not an error because of the lack of possibilities to
690	represent the result. This is a genuine bug in the input since
691	UCS4 does not allow such values. /*
692	if (irreversible == NULL)
693	/ We are transliterating, don't try to correct anything. /
694	return __GCONV_ILLEGAL_INPUT;
695
696	if (flags & __GCONV_IGNORE_ERRORS)
697	{
698	/ Just ignore this character. /
699	++*irreversible;
700	continue;
701	}
702
703	*inptrp = inptr;
704	*outptrp = outptr;
705	return __GCONV_ILLEGAL_INPUT;
706	}
707
708	# if __BYTE_ORDER == __BIG_ENDIAN
709	outptr[`3`] = inptr[`0`];
710	outptr[`2`] = inptr[`1`];
711	outptr[`1`] = inptr[`2`];
712	outptr[`0`] = inptr[`3`];
713	# else
714	outptr[`0`] = inptr[`0`];
715	outptr[`1`] = inptr[`1`];
716	outptr[`2`] = inptr[`2`];
717	outptr[`3`] = inptr[`3`];
718	# endif
719
720	outptr += `4`;
721	}
722
723	*inptrp = inptr;
724	*outptrp = outptr;
725
726	/ Determine the status. /
727	if (*inptrp == inend)
728	result = __GCONV_EMPTY_INPUT;
729	else if (*inptrp + `4` > inend)
730	result = __GCONV_INCOMPLETE_INPUT;
731	else
732	{
733	assert (*outptrp + `4` > outend);
734	result = __GCONV_FULL_OUTPUT;
735	}
736
737	return result;
738	}
739	#endif
740
741
742	static inline int
743	__attribute ((always_inline))
744	ucs4le_internal_loop_single (struct __gconv_step *step,
745	struct __gconv_step_data *step_data,
746	const unsigned char **inptrp,
747	const unsigned char *inend,
748	unsigned char *outptrp, unsigned* char *outend,
749	size_t *irreversible)
750	{
751	mbstate_t *state = step_data->__statep;
752	int flags = step_data->__flags;
753	size_t cnt = state->__count & `7`;
754
755	while (*inptrp < inend && cnt < `4`)
756	state->__value.__wchb[cnt++] = (inptrp)++;
757
758	if (__glibc_unlikely (cnt < `4`))
759	{
760	/ Still not enough bytes. Store the ones in the input buffer. /
761	state->__count &= ~`7`;
762	state->__count \|= cnt;
763
764	return __GCONV_INCOMPLETE_INPUT;
765	}
766
767	if (__builtin_expect (((unsigned char *) state->__value.__wchb)[`3`] > `0x80`,
768	`0`))
769	{
770	/ The value is too large. We don't try transliteration here since*
771	this is not an error because of the lack of possibilities to
772	represent the result. This is a genuine bug in the input since
773	UCS4 does not allow such values. /*
774	if (!(flags & __GCONV_IGNORE_ERRORS))
775	return __GCONV_ILLEGAL_INPUT;
776	}
777	else
778	{
779	#if __BYTE_ORDER == __BIG_ENDIAN
780	(*outptrp)[`0`] = state->__value.__wchb[`3`];
781	(*outptrp)[`1`] = state->__value.__wchb[`2`];
782	(*outptrp)[`2`] = state->__value.__wchb[`1`];
783	(*outptrp)[`3`] = state->__value.__wchb[`0`];
784	#else
785	(*outptrp)[`0`] = state->__value.__wchb[`0`];
786	(*outptrp)[`1`] = state->__value.__wchb[`1`];
787	(*outptrp)[`2`] = state->__value.__wchb[`2`];
788	(*outptrp)[`3`] = state->__value.__wchb[`3`];
789	#endif
790
791	*outptrp += `4`;
792	}
793
794	/ Clear the state buffer. /
795	state->__count &= ~`7`;
796
797	return __GCONV_OK;
798	}
799
800	#include <iconv/skeleton.c>
801
802
803	/ Convert from ISO 646-IRV to the internal (UCS4-like) format. /
804	#define DEFINE_INIT 0
805	#define DEFINE_FINI 0
806	#define MIN_NEEDED_FROM 1
807	#define MIN_NEEDED_TO 4
808	#define FROM_DIRECTION 1
809	#define FROM_LOOP ascii_internal_loop
810	#define TO_LOOP ascii_internal_loop /* This is not used. */
811	#define FUNCTION_NAME __gconv_transform_ascii_internal
812	#define ONE_DIRECTION 1
813
814	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
815	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
816	#define LOOPFCT FROM_LOOP
817	#define BODY \
818	{ \
819	if (__glibc_unlikely (*inptr > '\x7f')) \
820	{ \
821	/* The value is too large. We don't try transliteration here since \
822	this is not an error because of the lack of possibilities to \
823	represent the result. This is a genuine bug in the input since \
824	ASCII does not allow such values. */ \
825	STANDARD_FROM_LOOP_ERR_HANDLER (1); \
826	} \
827	else \
828	{ \
829	/* It's an one byte sequence. */ \
830	((uint32_t ) outptr) = *inptr++; \
831	outptr += sizeof (uint32_t); \
832	} \
833	}
834	#define LOOP_NEED_FLAGS
835	#include <iconv/loop.c>
836	#include <iconv/skeleton.c>
837
838
839	/ Convert from the internal (UCS4-like) format to ISO 646-IRV. /
840	#define DEFINE_INIT 0
841	#define DEFINE_FINI 0
842	#define MIN_NEEDED_FROM 4
843	#define MIN_NEEDED_TO 1
844	#define FROM_DIRECTION 1
845	#define FROM_LOOP internal_ascii_loop
846	#define TO_LOOP internal_ascii_loop /* This is not used. */
847	#define FUNCTION_NAME __gconv_transform_internal_ascii
848	#define ONE_DIRECTION 1
849
850	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
851	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
852	#define LOOPFCT FROM_LOOP
853	#define BODY \
854	{ \
855	if (__glibc_unlikely (((const uint32_t ) inptr) > 0x7f)) \
856	{ \
857	UNICODE_TAG_HANDLER (((const uint32_t ) inptr), 4); \
858	STANDARD_TO_LOOP_ERR_HANDLER (4); \
859	} \
860	else \
861	{ \
862	/* It's an one byte sequence. */ \
863	outptr++ = ((const uint32_t *) inptr); \
864	inptr += sizeof (uint32_t); \
865	} \
866	}
867	#define LOOP_NEED_FLAGS
868	#include <iconv/loop.c>
869	#include <iconv/skeleton.c>
870
871
872	/ Convert from the internal (UCS4-like) format to UTF-8. /
873	#define DEFINE_INIT 0
874	#define DEFINE_FINI 0
875	#define MIN_NEEDED_FROM 4
876	#define MIN_NEEDED_TO 1
877	#define MAX_NEEDED_TO 6
878	#define FROM_DIRECTION 1
879	#define FROM_LOOP internal_utf8_loop
880	#define TO_LOOP internal_utf8_loop /* This is not used. */
881	#define FUNCTION_NAME __gconv_transform_internal_utf8
882	#define ONE_DIRECTION 1
883
884	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
885	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
886	#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
887	#define LOOPFCT FROM_LOOP
888	#define BODY \
889	{ \
890	uint32_t wc = ((const uint32_t ) inptr); \
891	\
892	if (__glibc_likely (wc < 0x80)) \
893	/* It's an one byte sequence. */ \
894	*outptr++ = (unsigned char) wc; \
895	else if (__glibc_likely (wc <= 0x7fffffff \
896	&& (wc < 0xd800 \|\| wc > 0xdfff))) \
897	{ \
898	size_t step; \
899	unsigned char *start; \
900	\
901	for (step = 2; step < 6; ++step) \
902	if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
903	break; \
904	\
905	if (__glibc_unlikely (outptr + step > outend)) \
906	{ \
907	/* Too long. */ \
908	result = __GCONV_FULL_OUTPUT; \
909	break; \
910	} \
911	\
912	start = outptr; \
913	*outptr = (unsigned char) (~0xff >> step); \
914	outptr += step; \
915	do \
916	{ \
917	start[--step] = 0x80 \| (wc & 0x3f); \
918	wc >>= 6; \
919	} \
920	while (step > 1); \
921	start[0] \|= wc; \
922	} \
923	else \
924	{ \
925	STANDARD_TO_LOOP_ERR_HANDLER (4); \
926	} \
927	\
928	inptr += 4; \
929	}
930	#define LOOP_NEED_FLAGS
931	#include <iconv/loop.c>
932	#include <iconv/skeleton.c>
933
934
935	/ Convert from UTF-8 to the internal (UCS4-like) format. /
936	#define DEFINE_INIT 0
937	#define DEFINE_FINI 0
938	#define MIN_NEEDED_FROM 1
939	#define MAX_NEEDED_FROM 6
940	#define MIN_NEEDED_TO 4
941	#define FROM_DIRECTION 1
942	#define FROM_LOOP utf8_internal_loop
943	#define TO_LOOP utf8_internal_loop /* This is not used. */
944	#define FUNCTION_NAME __gconv_transform_utf8_internal
945	#define ONE_DIRECTION 1
946
947	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
948	#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
949	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
950	#define LOOPFCT FROM_LOOP
951	#define BODY \
952	{ \
953	/* Next input byte. */ \
954	uint32_t ch = *inptr; \
955	\
956	if (__glibc_likely (ch < 0x80)) \
957	{ \
958	/* One byte sequence. */ \
959	++inptr; \
960	} \
961	else \
962	{ \
963	uint_fast32_t cnt; \
964	uint_fast32_t i; \
965	\
966	if (ch >= 0xc2 && ch < 0xe0) \
967	{ \
968	/* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \
969	otherwise the wide character could have been represented \
970	using a single byte. */ \
971	cnt = 2; \
972	ch &= 0x1f; \
973	} \
974	else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
975	{ \
976	/* We expect three bytes. */ \
977	cnt = 3; \
978	ch &= 0x0f; \
979	} \
980	else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
981	{ \
982	/* We expect four bytes. */ \
983	cnt = 4; \
984	ch &= 0x07; \
985	} \
986	else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
987	{ \
988	/* We expect five bytes. */ \
989	cnt = 5; \
990	ch &= 0x03; \
991	} \
992	else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
993	{ \
994	/* We expect six bytes. */ \
995	cnt = 6; \
996	ch &= 0x01; \
997	} \
998	else \
999	{ \
1000	/* Search the end of this ill-formed UTF-8 character. This \
1001	is the next byte with (x & 0xc0) != 0x80. */ \
1002	i = 0; \
1003	do \
1004	++i; \
1005	while (inptr + i < inend \
1006	&& (*(inptr + i) & 0xc0) == 0x80 \
1007	&& i < 5); \
1008	\
1009	errout: \
1010	STANDARD_FROM_LOOP_ERR_HANDLER (i); \
1011	} \
1012	\
1013	if (__glibc_unlikely (inptr + cnt > inend)) \
1014	{ \
1015	/* We don't have enough input. But before we report that check \
1016	that all the bytes are correct. */ \
1017	for (i = 1; inptr + i < inend; ++i) \
1018	if ((inptr[i] & 0xc0) != 0x80) \
1019	break; \
1020	\
1021	if (__glibc_likely (inptr + i == inend)) \
1022	{ \
1023	result = __GCONV_INCOMPLETE_INPUT; \
1024	break; \
1025	} \
1026	\
1027	goto errout; \
1028	} \
1029	\
1030	/* Read the possible remaining bytes. */ \
1031	for (i = 1; i < cnt; ++i) \
1032	{ \
1033	uint32_t byte = inptr[i]; \
1034	\
1035	if ((byte & 0xc0) != 0x80) \
1036	/* This is an illegal encoding. */ \
1037	break; \
1038	\
1039	ch <<= 6; \
1040	ch \|= byte & 0x3f; \
1041	} \
1042	\
1043	/* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
1044	If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
1045	have been represented with fewer than cnt bytes. */ \
1046	if (i < cnt \|\| (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
1047	/* Do not accept UTF-16 surrogates. */ \
1048	\|\| (ch >= 0xd800 && ch <= 0xdfff)) \
1049	{ \
1050	/* This is an illegal encoding. */ \
1051	goto errout; \
1052	} \
1053	\
1054	inptr += cnt; \
1055	} \
1056	\
1057	/* Now adjust the pointers and store the result. */ \
1058	((uint32_t ) outptr) = ch; \
1059	outptr += sizeof (uint32_t); \
1060	}
1061	#define LOOP_NEED_FLAGS
1062
1063	#define STORE_REST \
1064	{ \
1065	/* We store the remaining bytes while converting them into the UCS4 \
1066	format. We can assume that the first byte in the buffer is \
1067	correct and that it requires a larger number of bytes than there \
1068	are in the input buffer. */ \
1069	wint_t ch = **inptrp; \
1070	size_t cnt, r; \
1071	\
1072	state->__count = inend - *inptrp; \
1073	\
1074	assert (ch != 0xc0 && ch != 0xc1); \
1075	if (ch >= 0xc2 && ch < 0xe0) \
1076	{ \
1077	/* We expect two bytes. The first byte cannot be 0xc0 or \
1078	0xc1, otherwise the wide character could have been \
1079	represented using a single byte. */ \
1080	cnt = 2; \
1081	ch &= 0x1f; \
1082	} \
1083	else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
1084	{ \
1085	/* We expect three bytes. */ \
1086	cnt = 3; \
1087	ch &= 0x0f; \
1088	} \
1089	else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
1090	{ \
1091	/* We expect four bytes. */ \
1092	cnt = 4; \
1093	ch &= 0x07; \
1094	} \
1095	else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
1096	{ \
1097	/* We expect five bytes. */ \
1098	cnt = 5; \
1099	ch &= 0x03; \
1100	} \
1101	else \
1102	{ \
1103	/* We expect six bytes. */ \
1104	cnt = 6; \
1105	ch &= 0x01; \
1106	} \
1107	\
1108	/* The first byte is already consumed. */ \
1109	r = cnt - 1; \
1110	while (++(*inptrp) < inend) \
1111	{ \
1112	ch <<= 6; \
1113	ch \|= **inptrp & 0x3f; \
1114	--r; \
1115	} \
1116	\
1117	/* Shift for the so far missing bytes. */ \
1118	ch <<= r * 6; \
1119	\
1120	/* Store the number of bytes expected for the entire sequence. */ \
1121	state->__count \|= cnt << 8; \
1122	\
1123	/* Store the value. */ \
1124	state->__value.__wch = ch; \
1125	}
1126
/* Rebuild the bytes of an incomplete UTF-8 sequence from the conversion
   state.

   Expects in scope:
     state    conversion state; the low 8 bits of state->__count are the
              number of bytes that had been read, the bits above them the
              number of bytes of the complete sequence, and
              state->__value.__wch the payload bits gathered so far,
     bytebuf  byte array receiving the reconstructed bytes,
     inlen    variable receiving the number of bytes that had been read.  */
#define UNPACK_BYTES \
  { \
    /* Lead-byte markers for sequences of two to six bytes.  */ \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    /* Walk from the last position of the sequence down to position 1. \
       Only positions below INLEN were actually read and get a byte; \
       the six payload bits of every position are dropped either way.  */ \
    do \
      { \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    /* What is left are the payload bits of the first byte.  */ \
    bytebuf[0] |= wch; \
  }
1147
/* Reset the conversion state by setting state->__count to zero.  */
#define CLEAR_STATE \
  state->__count = 0
1150
1151
1152	#include <iconv/loop.c>
1153	#include <iconv/skeleton.c>
1154
1155
1156	/ Convert from UCS2 to the internal (UCS4-like) format. /
1157	#define DEFINE_INIT 0
1158	#define DEFINE_FINI 0
1159	#define MIN_NEEDED_FROM 2
1160	#define MIN_NEEDED_TO 4
1161	#define FROM_DIRECTION 1
1162	#define FROM_LOOP ucs2_internal_loop
1163	#define TO_LOOP ucs2_internal_loop /* This is not used. */
1164	#define FUNCTION_NAME __gconv_transform_ucs2_internal
1165	#define ONE_DIRECTION 1
1166
1167	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1168	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1169	#define LOOPFCT FROM_LOOP
1170	#define BODY \
1171	{ \
1172	uint16_t u1 = get16 (inptr); \
1173	\
1174	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1175	{ \
1176	/* Surrogate characters in UCS-2 input are not valid. Reject \
1177	them. (Catching this here is not security relevant.) */ \
1178	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
1179	} \
1180	\
1181	((uint32_t ) outptr) = u1; \
1182	outptr += sizeof (uint32_t); \
1183	inptr += 2; \
1184	}
1185	#define LOOP_NEED_FLAGS
1186	#include <iconv/loop.c>
1187	#include <iconv/skeleton.c>
1188
1189
1190	/ Convert from the internal (UCS4-like) format to UCS2. /
1191	#define DEFINE_INIT 0
1192	#define DEFINE_FINI 0
1193	#define MIN_NEEDED_FROM 4
1194	#define MIN_NEEDED_TO 2
1195	#define FROM_DIRECTION 1
1196	#define FROM_LOOP internal_ucs2_loop
1197	#define TO_LOOP internal_ucs2_loop /* This is not used. */
1198	#define FUNCTION_NAME __gconv_transform_internal_ucs2
1199	#define ONE_DIRECTION 1
1200
1201	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1202	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1203	#define LOOPFCT FROM_LOOP
1204	#define BODY \
1205	{ \
1206	uint32_t val = ((const uint32_t ) inptr); \
1207	\
1208	if (__glibc_unlikely (val >= 0x10000)) \
1209	{ \
1210	UNICODE_TAG_HANDLER (val, 4); \
1211	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1212	} \
1213	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1214	{ \
1215	/* Surrogate characters in UCS-4 input are not valid. \
1216	We must catch this, because the UCS-2 output might be \
1217	interpreted as UTF-16 by other programs. If we let \
1218	surrogates pass through, attackers could make a security \
1219	hole exploit by synthesizing any desired plane 1-16 \
1220	character. */ \
1221	result = __GCONV_ILLEGAL_INPUT; \
1222	if (! ignore_errors_p ()) \
1223	break; \
1224	inptr += 4; \
1225	++*irreversible; \
1226	continue; \
1227	} \
1228	else \
1229	{ \
1230	put16 (outptr, val); \
1231	outptr += sizeof (uint16_t); \
1232	inptr += 4; \
1233	} \
1234	}
1235	#define LOOP_NEED_FLAGS
1236	#include <iconv/loop.c>
1237	#include <iconv/skeleton.c>
1238
1239
1240	/ Convert from UCS2 in other endianness to the internal (UCS4-like) format. /
1241	#define DEFINE_INIT 0
1242	#define DEFINE_FINI 0
1243	#define MIN_NEEDED_FROM 2
1244	#define MIN_NEEDED_TO 4
1245	#define FROM_DIRECTION 1
1246	#define FROM_LOOP ucs2reverse_internal_loop
1247	#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
1248	#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
1249	#define ONE_DIRECTION 1
1250
1251	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1252	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1253	#define LOOPFCT FROM_LOOP
1254	#define BODY \
1255	{ \
1256	uint16_t u1 = bswap_16 (get16 (inptr)); \
1257	\
1258	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1259	{ \
1260	/* Surrogate characters in UCS-2 input are not valid. Reject \
1261	them. (Catching this here is not security relevant.) */ \
1262	if (! ignore_errors_p ()) \
1263	{ \
1264	result = __GCONV_ILLEGAL_INPUT; \
1265	break; \
1266	} \
1267	inptr += 2; \
1268	++*irreversible; \
1269	continue; \
1270	} \
1271	\
1272	((uint32_t ) outptr) = u1; \
1273	outptr += sizeof (uint32_t); \
1274	inptr += 2; \
1275	}
1276	#define LOOP_NEED_FLAGS
1277	#include <iconv/loop.c>
1278	#include <iconv/skeleton.c>
1279
1280
1281	/ Convert from the internal (UCS4-like) format to UCS2 in other endianness. /
1282	#define DEFINE_INIT 0
1283	#define DEFINE_FINI 0
1284	#define MIN_NEEDED_FROM 4
1285	#define MIN_NEEDED_TO 2
1286	#define FROM_DIRECTION 1
1287	#define FROM_LOOP internal_ucs2reverse_loop
1288	#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
1289	#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
1290	#define ONE_DIRECTION 1
1291
1292	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1293	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1294	#define LOOPFCT FROM_LOOP
1295	#define BODY \
1296	{ \
1297	uint32_t val = ((const uint32_t ) inptr); \
1298	if (__glibc_unlikely (val >= 0x10000)) \
1299	{ \
1300	UNICODE_TAG_HANDLER (val, 4); \
1301	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1302	} \
1303	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1304	{ \
1305	/* Surrogate characters in UCS-4 input are not valid. \
1306	We must catch this, because the UCS-2 output might be \
1307	interpreted as UTF-16 by other programs. If we let \
1308	surrogates pass through, attackers could make a security \
1309	hole exploit by synthesizing any desired plane 1-16 \
1310	character. */ \
1311	if (! ignore_errors_p ()) \
1312	{ \
1313	result = __GCONV_ILLEGAL_INPUT; \
1314	break; \
1315	} \
1316	inptr += 4; \
1317	++*irreversible; \
1318	continue; \
1319	} \
1320	else \
1321	{ \
1322	put16 (outptr, bswap_16 (val)); \
1323	outptr += sizeof (uint16_t); \
1324	inptr += 4; \
1325	} \
1326	}
1327	#define LOOP_NEED_FLAGS
1328	#include <iconv/loop.c>
1329	#include <iconv/skeleton.c>
1330

Browse the source code of glibc/iconv/gconv_simple.c