gconv_simple.c source code [glibc/iconv/gconv_simple.c]

/* Simple transformations functions.
   Copyright (C) 1997-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <byteswap.h>
20	#include <dlfcn.h>
21	#include <endian.h>
22	#include <errno.h>
23	#include <gconv.h>
24	#include <stdint.h>
25	#include <stdlib.h>
26	#include <string.h>
27	#include <wchar.h>
28	#include <sys/param.h>
29	#include <gconv_int.h>
30
31	#define BUILTIN_ALIAS(s1, s2) /* nothing */
32	#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
33	MinF, MaxF, MinT, MaxT) \
34	extern int Fct (struct __gconv_step , struct __gconv_step_data , \
35	const unsigned char *, const unsigned char , \
36	unsigned char *, size_t , int, int);
37	#include "gconv_builtin.h"
38
39
40	#ifndef EILSEQ
41	# define EILSEQ EINVAL
42	#endif
43
44
/* Specialized conversion function for a single byte to INTERNAL, recognizing
   only ASCII characters.

   STEP is not used.  Returns C unchanged when it is below 0x80 and WEOF
   for every other byte value.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  if (c < 0x80)
    return c;
  else
    return WEOF;
}
55
56
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal ucs4 format and the real UCS4
   format is, if any, the endianness.  The Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.  */
62	#define DEFINE_INIT 0
63	#define DEFINE_FINI 0
64	#define MIN_NEEDED_FROM 4
65	#define MIN_NEEDED_TO 4
66	#define FROM_DIRECTION 1
67	#define FROM_LOOP internal_ucs4_loop
68	#define TO_LOOP internal_ucs4_loop /* This is not used. */
69	#define FUNCTION_NAME __gconv_transform_internal_ucs4
70	#define ONE_DIRECTION 0
71
72
/* Convert whole 4-byte units from the internal (host byte order) format
   to big-endian UCS4.

   As many units are converted as fit completely into both the input
   range [*INPTRP, INEND) and the output range [*OUTPTRP, OUTEND).  On
   return *INPTRP and *OUTPTRP point past the consumed input and the
   produced output.  STEP, STEP_DATA and IRREVERSIBLE are not used.

   Returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_FULL_OUTPUT when fewer than four output bytes remain, and
   __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;
  uint32_t *outptr32 = (uint32_t *) outptr;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
    *outptr32++ = bswap_32 (*(const uint32_t *) inptr);

  *inptrp = inptr;
  *outptrp = (unsigned char *) outptr32;
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
#else
# error "This endianess is not supported."
#endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
114
#if !_STRING_ARCH_unaligned
/* Variant of internal_ucs4_loop that touches the buffers one byte at a
   time, so neither pointer has to be aligned for uint32_t access.

   Converts as many whole 4-byte units as fit into both [*INPTRP, INEND)
   and [*OUTPTRP, OUTEND) from host byte order to big-endian and advances
   *INPTRP and *OUTPTRP.  STEP, STEP_DATA and IRREVERSIBLE are not used.

   Returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_FULL_OUTPUT when fewer than four output bytes remain, and
   __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_unaligned (struct __gconv_step *step,
                              struct __gconv_step_data *step_data,
                              const unsigned char **inptrp,
                              const unsigned char *inend,
                              unsigned char **outptrp,
                              const unsigned char *outend,
                              size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

# if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    {
      /* Reverse the four bytes of the unit.  */
      outptr[0] = inptr[3];
      outptr[1] = inptr[2];
      outptr[2] = inptr[1];
      outptr[3] = inptr[0];
    }

  *inptrp = inptr;
  *outptrp = outptr;
# elif __BYTE_ORDER == __BIG_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
# else
#  error "This endianess is not supported."
# endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
#endif
164
165
/* Finish one character whose leading bytes were saved in the conversion
   state by an earlier call (internal format to big-endian UCS4).

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes already buffered in __value.__wchb.  Bytes are appended from
   [*INPTRP, INEND) until four are available.  If the input runs out
   first, the new count is stored and __GCONV_INCOMPLETE_INPUT is
   returned.  Otherwise the four bytes are written to *OUTPTRP in
   big-endian order, *OUTPTRP is advanced by four, the count bits are
   cleared and __GCONV_OK is returned.  OUTEND is not checked here.
   STEP and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

#if __BYTE_ORDER == __LITTLE_ENDIAN
  (*outptrp)[0] = state->__value.__wchb[3];
  (*outptrp)[1] = state->__value.__wchb[2];
  (*outptrp)[2] = state->__value.__wchb[1];
  (*outptrp)[3] = state->__value.__wchb[0];

#elif __BYTE_ORDER == __BIG_ENDIAN
  /* XXX unaligned */
  (*outptrp)[0] = state->__value.__wchb[0];
  (*outptrp)[1] = state->__value.__wchb[1];
  (*outptrp)[2] = state->__value.__wchb[2];
  (*outptrp)[3] = state->__value.__wchb[3];
#else
# error "This endianess is not supported."
#endif
  *outptrp += 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
213
214	#include <iconv/skeleton.c>
215
216
/* Transform from UCS4 to the internal, UCS4-like format.  Unlike
   for the other direction we have to check for correct values here.  */
219	#define DEFINE_INIT 0
220	#define DEFINE_FINI 0
221	#define MIN_NEEDED_FROM 4
222	#define MIN_NEEDED_TO 4
223	#define FROM_DIRECTION 1
224	#define FROM_LOOP ucs4_internal_loop
225	#define TO_LOOP ucs4_internal_loop /* This is not used. */
226	#define FUNCTION_NAME __gconv_transform_ucs4_internal
227	#define ONE_DIRECTION 0
228
229
/* Convert big-endian UCS4 units to the internal (host byte order)
   format, rejecting values above 0x7fffffff.

   Units are processed while at least four bytes remain in both
   [*INPTRP, INEND) and [*OUTPTRP, OUTEND).  For an out-of-range value:
   - if IRREVERSIBLE is NULL, __GCONV_ILLEGAL_INPUT is returned at once
     and *INPTRP / *OUTPTRP are left untouched;
   - else if STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, the unit
     is skipped and *IRREVERSIBLE is incremented;
   - else *INPTRP / *OUTPTRP are set to the offending unit and the
     current output position and __GCONV_ILLEGAL_INPUT is returned.

   Otherwise returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_FULL_OUTPUT when fewer than four output bytes remain, and
   __GCONV_INCOMPLETE_INPUT otherwise.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
    {
      uint32_t inval;

#if __BYTE_ORDER == __LITTLE_ENDIAN
      inval = bswap_32 (*(const uint32_t *) inptr);
#else
      inval = *(const uint32_t *) inptr;
#endif

      if (__glibc_unlikely (inval > 0x7fffffff))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

      *((uint32_t *) outptr) = inval;
      outptr += sizeof (uint32_t);
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
292
#if !_STRING_ARCH_unaligned
/* Variant of ucs4_internal_loop that accesses the buffers bytewise, so
   the pointers need not be aligned for uint32_t access.

   Converts big-endian UCS4 units to host byte order while at least four
   bytes remain in both [*INPTRP, INEND) and [*OUTPTRP, OUTEND).  A unit
   whose most significant byte is above 0x7f (value > 0x7fffffff) is
   illegal:
   - if IRREVERSIBLE is NULL, __GCONV_ILLEGAL_INPUT is returned at once
     and *INPTRP / *OUTPTRP are left untouched;
   - else if STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, the unit
     is skipped and *IRREVERSIBLE is incremented;
   - else *INPTRP / *OUTPTRP are set to the offending unit and the
     current output position and __GCONV_ILLEGAL_INPUT is returned.

   Otherwise returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_FULL_OUTPUT when fewer than four output bytes remain, and
   __GCONV_INCOMPLETE_INPUT otherwise.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_unaligned (struct __gconv_step *step,
                              struct __gconv_step_data *step_data,
                              const unsigned char **inptrp,
                              const unsigned char *inend,
                              unsigned char **outptrp,
                              const unsigned char *outend,
                              size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
    {
      /* inptr[0] is the most significant byte.  Any value with bit 31
         set is out of range, which includes a top byte of exactly 0x80;
         this matches the "inval > 0x7fffffff" test of the aligned
         loop.  */
      if (__glibc_unlikely (inptr[0] > 0x7f))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

# if __BYTE_ORDER == __LITTLE_ENDIAN
      outptr[3] = inptr[0];
      outptr[2] = inptr[1];
      outptr[1] = inptr[2];
      outptr[0] = inptr[3];
# else
      outptr[0] = inptr[0];
      outptr[1] = inptr[1];
      outptr[2] = inptr[2];
      outptr[3] = inptr[3];
# endif
      outptr += 4;
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
#endif
361
362
/* Finish one big-endian UCS4 character whose leading bytes were saved
   in the conversion state by an earlier call.

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes already buffered in __value.__wchb.  Bytes are appended from
   [*INPTRP, INEND) until four are available.  If the input runs out
   first, the new count is stored and __GCONV_INCOMPLETE_INPUT is
   returned.

   A completed unit whose most significant byte is above 0x7f is illegal.
   Unless STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, *INPTRP is
   moved back over the bytes consumed by this call and
   __GCONV_ILLEGAL_INPUT is returned with the state left unchanged.  With
   the flag set the unit is dropped silently.  A legal unit is written to
   *OUTPTRP in host byte order and *OUTPTRP is advanced by four.  In both
   of these cases the count bits are cleared and __GCONV_OK is returned.
   OUTEND is not checked here.  STEP and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* __wchb[0] is the most significant byte.  A top byte of exactly 0x80
     already means the value exceeds 0x7fffffff, so compare against
     0x7f.  */
  if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x7f,
                        0))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes taken from the input by this call; the
             count in the state still describes the earlier bytes.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __LITTLE_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#elif __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
424
425	#include <iconv/skeleton.c>
426
427
/* Similarly for the little endian form.  */
429	#define DEFINE_INIT 0
430	#define DEFINE_FINI 0
431	#define MIN_NEEDED_FROM 4
432	#define MIN_NEEDED_TO 4
433	#define FROM_DIRECTION 1
434	#define FROM_LOOP internal_ucs4le_loop
435	#define TO_LOOP internal_ucs4le_loop /* This is not used. */
436	#define FUNCTION_NAME __gconv_transform_internal_ucs4le
437	#define ONE_DIRECTION 0
438
439
/* Convert whole 4-byte units from the internal (host byte order) format
   to little-endian UCS4.

   As many units are converted as fit completely into both the input
   range [*INPTRP, INEND) and the output range [*OUTPTRP, OUTEND).  On
   return *INPTRP and *OUTPTRP point past the consumed input and the
   produced output.  STEP, STEP_DATA and IRREVERSIBLE are not used.

   Returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_FULL_OUTPUT when fewer than four output bytes remain, and
   __GCONV_INCOMPLETE_INPUT otherwise.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __BIG_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;
  uint32_t *outptr32 = (uint32_t *) outptr;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
    *outptr32++ = bswap_32 (*(const uint32_t *) inptr);
  outptr = (unsigned char *) outptr32;

  *inptrp = inptr;
  *outptrp = outptr;
#elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
#else
# error "This endianess is not supported."
#endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*outptrp + 4 > outend)
    result = __GCONV_FULL_OUTPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  return result;
}
482
#if !_STRING_ARCH_unaligned
/* Variant of internal_ucs4le_loop that touches the buffers one byte at
   a time, so neither pointer has to be aligned for uint32_t access.

   Converts as many whole 4-byte units as fit into both [*INPTRP, INEND)
   and [*OUTPTRP, OUTEND) from host byte order to little-endian and
   advances *INPTRP and *OUTPTRP.  STEP, STEP_DATA and IRREVERSIBLE are
   not used.

   Returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_INCOMPLETE_INPUT when fewer than four input bytes remain, and
   __GCONV_FULL_OUTPUT otherwise (asserting that the output really has
   fewer than four bytes left).  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_unaligned (struct __gconv_step *step,
                                struct __gconv_step_data *step_data,
                                const unsigned char **inptrp,
                                const unsigned char *inend,
                                unsigned char **outptrp,
                                const unsigned char *outend,
                                size_t *irreversible)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

# if __BYTE_ORDER == __BIG_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    {
      /* Reverse the four bytes of the unit.  */
      outptr[0] = inptr[3];
      outptr[1] = inptr[2];
      outptr[2] = inptr[1];
      outptr[3] = inptr[0];
    }

  *inptrp = inptr;
  *outptrp = outptr;
# elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Simply copy the data.  */
  *inptrp = inptr + n_convert * 4;
  *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
# else
#  error "This endianess is not supported."
# endif

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
#endif
535
536
/* Finish one character whose leading bytes were saved in the conversion
   state by an earlier call (internal format to little-endian UCS4).

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes already buffered in __value.__wchb.  Bytes are appended from
   [*INPTRP, INEND) until four are available.  If the input runs out
   first, the new count is stored and __GCONV_INCOMPLETE_INPUT is
   returned.  Otherwise the four bytes are written to *OUTPTRP in
   little-endian order, *OUTPTRP is advanced by four, the count bits are
   cleared and __GCONV_OK is returned.  OUTEND is not checked here.
   STEP and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

#if __BYTE_ORDER == __BIG_ENDIAN
  (*outptrp)[0] = state->__value.__wchb[3];
  (*outptrp)[1] = state->__value.__wchb[2];
  (*outptrp)[2] = state->__value.__wchb[1];
  (*outptrp)[3] = state->__value.__wchb[0];

#else
  /* XXX unaligned */
  (*outptrp)[0] = state->__value.__wchb[0];
  (*outptrp)[1] = state->__value.__wchb[1];
  (*outptrp)[2] = state->__value.__wchb[2];
  (*outptrp)[3] = state->__value.__wchb[3];

#endif

  *outptrp += 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
584
585	#include <iconv/skeleton.c>
586
587
/* And finally from UCS4-LE to the internal encoding.  */
589	#define DEFINE_INIT 0
590	#define DEFINE_FINI 0
591	#define MIN_NEEDED_FROM 4
592	#define MIN_NEEDED_TO 4
593	#define FROM_DIRECTION 1
594	#define FROM_LOOP ucs4le_internal_loop
595	#define TO_LOOP ucs4le_internal_loop /* This is not used. */
596	#define FUNCTION_NAME __gconv_transform_ucs4le_internal
597	#define ONE_DIRECTION 0
598
599
/* Convert little-endian UCS4 units to the internal (host byte order)
   format, rejecting values above 0x7fffffff.

   Units are processed while at least four bytes remain in both
   [*INPTRP, INEND) and [*OUTPTRP, OUTEND).  For an out-of-range value:
   - if IRREVERSIBLE is NULL, __GCONV_ILLEGAL_INPUT is returned at once
     and *INPTRP / *OUTPTRP are left untouched;
   - else if STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, the unit
     is skipped and *IRREVERSIBLE is incremented;
   - else *INPTRP / *OUTPTRP are set to the offending unit and the
     current output position and __GCONV_ILLEGAL_INPUT is returned.

   Otherwise returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_INCOMPLETE_INPUT when fewer than four input bytes remain, and
   __GCONV_FULL_OUTPUT otherwise (asserting that the output really has
   fewer than four bytes left).  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
    {
      uint32_t inval;

#if __BYTE_ORDER == __BIG_ENDIAN
      inval = bswap_32 (*(const uint32_t *) inptr);
#else
      inval = *(const uint32_t *) inptr;
#endif

      if (__glibc_unlikely (inval > 0x7fffffff))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

      *((uint32_t *) outptr) = inval;
      outptr += sizeof (uint32_t);
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
665
#if !_STRING_ARCH_unaligned
/* Variant of ucs4le_internal_loop that accesses the buffers bytewise,
   so the pointers need not be aligned for uint32_t access.

   Converts little-endian UCS4 units to host byte order while at least
   four bytes remain in both [*INPTRP, INEND) and [*OUTPTRP, OUTEND).  A
   unit whose most significant byte is above 0x7f (value > 0x7fffffff)
   is illegal:
   - if IRREVERSIBLE is NULL, __GCONV_ILLEGAL_INPUT is returned at once
     and *INPTRP / *OUTPTRP are left untouched;
   - else if STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, the unit
     is skipped and *IRREVERSIBLE is incremented;
   - else *INPTRP / *OUTPTRP are set to the offending unit and the
     current output position and __GCONV_ILLEGAL_INPUT is returned.

   Otherwise returns __GCONV_EMPTY_INPUT when all input was consumed,
   __GCONV_INCOMPLETE_INPUT when fewer than four input bytes remain, and
   __GCONV_FULL_OUTPUT otherwise (asserting that the output really has
   fewer than four bytes left).  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_unaligned (struct __gconv_step *step,
                                struct __gconv_step_data *step_data,
                                const unsigned char **inptrp,
                                const unsigned char *inend,
                                unsigned char **outptrp,
                                const unsigned char *outend,
                                size_t *irreversible)
{
  int flags = step_data->__flags;
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  int result;

  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
    {
      /* inptr[3] is the most significant byte.  Any value with bit 31
         set is out of range, which includes a top byte of exactly 0x80;
         this matches the "inval > 0x7fffffff" test of the aligned
         loop.  */
      if (__glibc_unlikely (inptr[3] > 0x7f))
        {
          /* The value is too large.  We don't try transliteration here since
             this is not an error because of the lack of possibilities to
             represent the result.  This is a genuine bug in the input since
             UCS4 does not allow such values.  */
          if (irreversible == NULL)
            /* We are transliterating, don't try to correct anything.  */
            return __GCONV_ILLEGAL_INPUT;

          if (flags & __GCONV_IGNORE_ERRORS)
            {
              /* Just ignore this character.  */
              ++*irreversible;
              continue;
            }

          *inptrp = inptr;
          *outptrp = outptr;
          return __GCONV_ILLEGAL_INPUT;
        }

# if __BYTE_ORDER == __BIG_ENDIAN
      outptr[3] = inptr[0];
      outptr[2] = inptr[1];
      outptr[1] = inptr[2];
      outptr[0] = inptr[3];
# else
      outptr[0] = inptr[0];
      outptr[1] = inptr[1];
      outptr[2] = inptr[2];
      outptr[3] = inptr[3];
# endif

      outptr += 4;
    }

  *inptrp = inptr;
  *outptrp = outptr;

  /* Determine the status.  */
  if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else if (*inptrp + 4 > inend)
    result = __GCONV_INCOMPLETE_INPUT;
  else
    {
      assert (*outptrp + 4 > outend);
      result = __GCONV_FULL_OUTPUT;
    }

  return result;
}
#endif
738
739
/* Finish one little-endian UCS4 character whose leading bytes were
   saved in the conversion state by an earlier call.

   The low three bits of STEP_DATA->__statep->__count hold the number of
   bytes already buffered in __value.__wchb.  Bytes are appended from
   [*INPTRP, INEND) until four are available.  If the input runs out
   first, the new count is stored and __GCONV_INCOMPLETE_INPUT is
   returned.

   A completed unit whose most significant byte is above 0x7f is illegal.
   Unless STEP_DATA->__flags has __GCONV_IGNORE_ERRORS set, *INPTRP is
   moved back over the bytes consumed by this call and
   __GCONV_ILLEGAL_INPUT is returned with the state left unchanged (the
   same recovery ucs4_internal_loop_single performs).  With the flag set
   the unit is dropped silently.  A legal unit is written to *OUTPTRP in
   host byte order and *OUTPTRP is advanced by four.  In both of these
   cases the count bits are cleared and __GCONV_OK is returned.  OUTEND
   is not checked here.  STEP and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* __wchb[3] is the most significant byte.  A top byte of exactly 0x80
     already means the value exceeds 0x7fffffff, so compare against
     0x7f.  */
  if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x7f,
                        0))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes taken from the input by this call; the
             count in the state still describes the earlier bytes.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#else
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
798
799	#include <iconv/skeleton.c>
800
801
/* Convert from ISO 646-IRV to the internal (UCS4-like) format.  */
803	#define DEFINE_INIT 0
804	#define DEFINE_FINI 0
805	#define MIN_NEEDED_FROM 1
806	#define MIN_NEEDED_TO 4
807	#define FROM_DIRECTION 1
808	#define FROM_LOOP ascii_internal_loop
809	#define TO_LOOP ascii_internal_loop /* This is not used. */
810	#define FUNCTION_NAME __gconv_transform_ascii_internal
811	#define ONE_DIRECTION 1
812
813	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
814	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
815	#define LOOPFCT FROM_LOOP
/* Loop body for ASCII -> internal: one input byte becomes one 32-bit
   output unit; bytes above 0x7f go to the standard error handler.  */
#define BODY \
  { \
    if (__glibc_unlikely (*inptr > '\x7f')) \
      { \
        /* The value is too large.  We don't try transliteration here since \
           this is not an error because of the lack of possibilities to \
           represent the result.  This is a genuine bug in the input since \
           ASCII does not allow such values.  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (1); \
      } \
    else \
      { \
        /* It's an one byte sequence.  */ \
        *((uint32_t *) outptr) = *inptr++; \
        outptr += sizeof (uint32_t); \
      } \
  }
833	#define LOOP_NEED_FLAGS
834	#include <iconv/loop.c>
835	#include <iconv/skeleton.c>
836
837
/* Convert from the internal (UCS4-like) format to ISO 646-IRV.  */
839	#define DEFINE_INIT 0
840	#define DEFINE_FINI 0
841	#define MIN_NEEDED_FROM 4
842	#define MIN_NEEDED_TO 1
843	#define FROM_DIRECTION 1
844	#define FROM_LOOP internal_ascii_loop
845	#define TO_LOOP internal_ascii_loop /* This is not used. */
846	#define FUNCTION_NAME __gconv_transform_internal_ascii
847	#define ONE_DIRECTION 1
848
849	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
850	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
851	#define LOOPFCT FROM_LOOP
/* Loop body for internal -> ASCII: a 32-bit unit up to 0x7f is stored
   as one byte; larger values are offered to UNICODE_TAG_HANDLER and
   then handed to the standard error handler.  */
#define BODY \
  { \
    if (__glibc_unlikely (*((const uint32_t *) inptr) > 0x7f)) \
      { \
        UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else \
      { \
        /* It's an one byte sequence.  */ \
        *outptr++ = *((const uint32_t *) inptr); \
        inptr += sizeof (uint32_t); \
      } \
  }
866	#define LOOP_NEED_FLAGS
867	#include <iconv/loop.c>
868	#include <iconv/skeleton.c>
869
870
/* Convert from the internal (UCS4-like) format to UTF-8.  */
872	#define DEFINE_INIT 0
873	#define DEFINE_FINI 0
874	#define MIN_NEEDED_FROM 4
875	#define MIN_NEEDED_TO 1
876	#define MAX_NEEDED_TO 6
877	#define FROM_DIRECTION 1
878	#define FROM_LOOP internal_utf8_loop
879	#define TO_LOOP internal_utf8_loop /* This is not used. */
880	#define FUNCTION_NAME __gconv_transform_internal_utf8
881	#define ONE_DIRECTION 1
882
883	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
884	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
885	#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
886	#define LOOPFCT FROM_LOOP
/* Loop body for internal -> UTF-8.  Values below 0x80 are copied as a
   single byte.  Other values up to 0x7fffffff that are not surrogates
   (0xd800..0xdfff) are encoded in 2 to 6 bytes.  Everything else goes to
   the standard error handler.  */
#define BODY \
  { \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc < 0x80)) \
      /* It's an one byte sequence.  */ \
      *outptr++ = (unsigned char) wc; \
    else if (__glibc_likely (wc <= 0x7fffffff \
                             && (wc < 0xd800 || wc > 0xdfff))) \
      { \
        size_t step; \
        unsigned char *start; \
        \
        /* Find the smallest length whose payload bits hold WC.  */ \
        for (step = 2; step < 6; ++step) \
          if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
            break; \
        \
        if (__glibc_unlikely (outptr + step > outend)) \
          { \
            /* Too long.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        start = outptr; \
        /* Lead byte marker: STEP high bits set.  */ \
        *outptr = (unsigned char) (~0xff >> step); \
        outptr += step; \
        /* Fill the continuation bytes from the last one backwards.  */ \
        do \
          { \
            start[--step] = 0x80 | (wc & 0x3f); \
            wc >>= 6; \
          } \
        while (step > 1); \
        start[0] |= wc; \
      } \
    else \
      { \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    \
    inptr += 4; \
  }
929	#define LOOP_NEED_FLAGS
930	#include <iconv/loop.c>
931	#include <iconv/skeleton.c>
932
933
/* Convert from UTF-8 to the internal (UCS4-like) format.  */
935	#define DEFINE_INIT 0
936	#define DEFINE_FINI 0
937	#define MIN_NEEDED_FROM 1
938	#define MAX_NEEDED_FROM 6
939	#define MIN_NEEDED_TO 4
940	#define FROM_DIRECTION 1
941	#define FROM_LOOP utf8_internal_loop
942	#define TO_LOOP utf8_internal_loop /* This is not used. */
943	#define FUNCTION_NAME __gconv_transform_utf8_internal
944	#define ONE_DIRECTION 1
945
946	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
947	#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
948	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
949	#define LOOPFCT FROM_LOOP
/* Loop body for UTF-8 -> internal.  Decodes one sequence of 1 to 6
   bytes starting at INPTR into a 32-bit value stored at OUTPTR.
   Invalid lead bytes, bad continuation bytes, over-long encodings and
   UTF-16 surrogates are routed to the standard error handler via the
   errout label; a sequence cut off by the end of the input yields
   __GCONV_INCOMPLETE_INPUT.  */
#define BODY \
  { \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        unsigned int cnt; \
        unsigned int i; \
        \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1, \
               otherwise the wide character could have been represented \
               using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
            \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report that check \
               that all the bytes are correct.  */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            goto errout; \
          } \
        \
        /* Read the possible remaining bytes.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
            \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
            \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
        \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates.  */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
1060	#define LOOP_NEED_FLAGS
1061
/* Save an incomplete UTF-8 sequence in the conversion state.  The low
   byte of state->__count receives the number of bytes available, the
   bits from bit 8 upwards the total length the lead byte announces, and
   state->__value.__wch the payload bits decoded so far, shifted left as
   if the missing continuation bytes were zero.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format.  We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer.  */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
    \
    state->__count = inend - *inptrp; \
    \
    assert (ch != 0xc0 && ch != 0xc1); \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* We expect two bytes.  The first byte cannot be 0xc0 or \
           0xc1, otherwise the wide character could have been \
           represented using a single byte.  */ \
        cnt = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* We expect three bytes.  */ \
        cnt = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* We expect four bytes.  */ \
        cnt = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* We expect five bytes.  */ \
        cnt = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* We expect six bytes.  */ \
        cnt = 6; \
        ch &= 0x01; \
      } \
    \
    /* The first byte is already consumed.  */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch <<= 6; \
        ch |= **inptrp & 0x3f; \
        --r; \
      } \
    \
    /* Shift for the so far missing bytes.  */ \
    ch <<= r * 6; \
    \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= cnt << 8; \
    \
    /* Store the value.  */ \
    state->__value.__wch = ch; \
  }
1125
/* Inverse of STORE_REST: rebuild in BYTEBUF the bytes of the partial
   UTF-8 sequence recorded in the state.  INLEN is set to the number of
   saved bytes (low byte of state->__count); the total sequence length
   comes from the bits above it and selects the lead-byte marker.  */
#define UNPACK_BYTES \
  { \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    do \
      { \
        /* Only positions that were actually saved are reconstructed.  */ \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    bytebuf[0] |= wch; \
  }

/* Forget any saved partial sequence.  */
#define CLEAR_STATE \
  state->__count = 0
1149
1150
1151	#include <iconv/loop.c>
1152	#include <iconv/skeleton.c>
1153
1154
1155	/ Convert from UCS2 to the internal (UCS4-like) format. /
1156	#define DEFINE_INIT 0
1157	#define DEFINE_FINI 0
1158	#define MIN_NEEDED_FROM 2
1159	#define MIN_NEEDED_TO 4
1160	#define FROM_DIRECTION 1
1161	#define FROM_LOOP ucs2_internal_loop
1162	#define TO_LOOP ucs2_internal_loop /* This is not used. */
1163	#define FUNCTION_NAME __gconv_transform_ucs2_internal
1164	#define ONE_DIRECTION 1
1165
1166	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1167	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1168	#define LOOPFCT FROM_LOOP
1169	#define BODY \
1170	{ \
1171	uint16_t u1 = get16 (inptr); \
1172	\
1173	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1174	{ \
1175	/* Surrogate characters in UCS-2 input are not valid. Reject \
1176	them. (Catching this here is not security relevant.) */ \
1177	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
1178	} \
1179	\
1180	((uint32_t ) outptr) = u1; \
1181	outptr += sizeof (uint32_t); \
1182	inptr += 2; \
1183	}
1184	#define LOOP_NEED_FLAGS
1185	#include <iconv/loop.c>
1186	#include <iconv/skeleton.c>
1187
1188
1189	/ Convert from the internal (UCS4-like) format to UCS2. /
1190	#define DEFINE_INIT 0
1191	#define DEFINE_FINI 0
1192	#define MIN_NEEDED_FROM 4
1193	#define MIN_NEEDED_TO 2
1194	#define FROM_DIRECTION 1
1195	#define FROM_LOOP internal_ucs2_loop
1196	#define TO_LOOP internal_ucs2_loop /* This is not used. */
1197	#define FUNCTION_NAME __gconv_transform_internal_ucs2
1198	#define ONE_DIRECTION 1
1199
1200	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1201	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1202	#define LOOPFCT FROM_LOOP
1203	#define BODY \
1204	{ \
1205	uint32_t val = ((const uint32_t ) inptr); \
1206	\
1207	if (__glibc_unlikely (val >= 0x10000)) \
1208	{ \
1209	UNICODE_TAG_HANDLER (val, 4); \
1210	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1211	} \
1212	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1213	{ \
1214	/* Surrogate characters in UCS-4 input are not valid. \
1215	We must catch this, because the UCS-2 output might be \
1216	interpreted as UTF-16 by other programs. If we let \
1217	surrogates pass through, attackers could make a security \
1218	hole exploit by synthesizing any desired plane 1-16 \
1219	character. */ \
1220	result = __GCONV_ILLEGAL_INPUT; \
1221	if (! ignore_errors_p ()) \
1222	break; \
1223	inptr += 4; \
1224	++*irreversible; \
1225	continue; \
1226	} \
1227	else \
1228	{ \
1229	put16 (outptr, val); \
1230	outptr += sizeof (uint16_t); \
1231	inptr += 4; \
1232	} \
1233	}
1234	#define LOOP_NEED_FLAGS
1235	#include <iconv/loop.c>
1236	#include <iconv/skeleton.c>
1237
1238
1239	/ Convert from UCS2 in other endianness to the internal (UCS4-like) format. /
1240	#define DEFINE_INIT 0
1241	#define DEFINE_FINI 0
1242	#define MIN_NEEDED_FROM 2
1243	#define MIN_NEEDED_TO 4
1244	#define FROM_DIRECTION 1
1245	#define FROM_LOOP ucs2reverse_internal_loop
1246	#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
1247	#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
1248	#define ONE_DIRECTION 1
1249
1250	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1251	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1252	#define LOOPFCT FROM_LOOP
1253	#define BODY \
1254	{ \
1255	uint16_t u1 = bswap_16 (get16 (inptr)); \
1256	\
1257	if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1258	{ \
1259	/* Surrogate characters in UCS-2 input are not valid. Reject \
1260	them. (Catching this here is not security relevant.) */ \
1261	if (! ignore_errors_p ()) \
1262	{ \
1263	result = __GCONV_ILLEGAL_INPUT; \
1264	break; \
1265	} \
1266	inptr += 2; \
1267	++*irreversible; \
1268	continue; \
1269	} \
1270	\
1271	((uint32_t ) outptr) = u1; \
1272	outptr += sizeof (uint32_t); \
1273	inptr += 2; \
1274	}
1275	#define LOOP_NEED_FLAGS
1276	#include <iconv/loop.c>
1277	#include <iconv/skeleton.c>
1278
1279
1280	/ Convert from the internal (UCS4-like) format to UCS2 in other endianness. /
1281	#define DEFINE_INIT 0
1282	#define DEFINE_FINI 0
1283	#define MIN_NEEDED_FROM 4
1284	#define MIN_NEEDED_TO 2
1285	#define FROM_DIRECTION 1
1286	#define FROM_LOOP internal_ucs2reverse_loop
1287	#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
1288	#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
1289	#define ONE_DIRECTION 1
1290
1291	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1292	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1293	#define LOOPFCT FROM_LOOP
1294	#define BODY \
1295	{ \
1296	uint32_t val = ((const uint32_t ) inptr); \
1297	if (__glibc_unlikely (val >= 0x10000)) \
1298	{ \
1299	UNICODE_TAG_HANDLER (val, 4); \
1300	STANDARD_TO_LOOP_ERR_HANDLER (4); \
1301	} \
1302	else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1303	{ \
1304	/* Surrogate characters in UCS-4 input are not valid. \
1305	We must catch this, because the UCS-2 output might be \
1306	interpreted as UTF-16 by other programs. If we let \
1307	surrogates pass through, attackers could make a security \
1308	hole exploit by synthesizing any desired plane 1-16 \
1309	character. */ \
1310	if (! ignore_errors_p ()) \
1311	{ \
1312	result = __GCONV_ILLEGAL_INPUT; \
1313	break; \
1314	} \
1315	inptr += 4; \
1316	++*irreversible; \
1317	continue; \
1318	} \
1319	else \
1320	{ \
1321	put16 (outptr, bswap_16 (val)); \
1322	outptr += sizeof (uint16_t); \
1323	inptr += 4; \
1324	} \
1325	}
1326	#define LOOP_NEED_FLAGS
1327	#include <iconv/loop.c>
1328	#include <iconv/skeleton.c>
1329

Browse the source code of glibc/iconv/gconv_simple.c