1 | /* Simple transformations functions. |
2 | Copyright (C) 1997-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <byteswap.h> |
20 | #include <dlfcn.h> |
21 | #include <endian.h> |
22 | #include <errno.h> |
23 | #include <gconv.h> |
24 | #include <stdint.h> |
25 | #include <stdlib.h> |
26 | #include <string.h> |
27 | #include <wchar.h> |
28 | #include <sys/param.h> |
29 | #include <gconv_int.h> |
30 | |
/* Turn the entries of "gconv_builtin.h" into prototypes.  With these
   definitions a BUILTIN_ALIAS entry expands to nothing and a
   BUILTIN_TRANSFORMATION entry expands to an extern declaration of its
   conversion function Fct; all other arguments are ignored here.
   (Presumably gconv_builtin.h consists of invocations of these two
   macros -- it is not visible from this file.)  */
#define BUILTIN_ALIAS(s1, s2) /* nothing */
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
                               MinF, MaxF, MinT, MaxT) \
  extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
                  const unsigned char **, const unsigned char *, \
                  unsigned char **, size_t *, int, int);
#include "gconv_builtin.h"
38 | |
39 | |
/* Fall back to EINVAL when the headers above do not define EILSEQ.  */
#ifndef EILSEQ
# define EILSEQ EINVAL
#endif
43 | |
44 | |
/* Single byte to INTERNAL conversion that accepts ASCII only.
   Returns C itself for bytes below 0x80 and WEOF for every other byte.
   STEP is not used.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  return c < 0x80 ? (wint_t) c : WEOF;
}
55 | |
56 | |
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal ucs4 format and the real UCS4
   format is, if any, the endianness.  Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.  */
/* Settings for the <iconv/skeleton.c> inclusion that closes this section.
   skeleton.c is not visible here, so the exact meaning of each macro
   should be checked there.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* Both the source and the target use 4 bytes per character.  */
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs4_loop
#define TO_LOOP internal_ucs4_loop /* This is not used.  */
/* Presumably the name of the entry point skeleton.c generates.  */
#define FUNCTION_NAME __gconv_transform_internal_ucs4
#define ONE_DIRECTION 0
71 | |
72 | |
73 | static inline int |
74 | __attribute ((always_inline)) |
75 | internal_ucs4_loop (struct __gconv_step *step, |
76 | struct __gconv_step_data *step_data, |
77 | const unsigned char **inptrp, const unsigned char *inend, |
78 | unsigned char **outptrp, const unsigned char *outend, |
79 | size_t *irreversible) |
80 | { |
81 | const unsigned char *inptr = *inptrp; |
82 | unsigned char *outptr = *outptrp; |
83 | size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; |
84 | int result; |
85 | |
86 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
87 | /* Sigh, we have to do some real work. */ |
88 | size_t cnt; |
89 | |
90 | for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) |
91 | { |
92 | uint32_t val = get32 (inptr); |
93 | put32 (outptr, __builtin_bswap32 (val)); |
94 | } |
95 | |
96 | *inptrp = inptr; |
97 | *outptrp = outptr; |
98 | #elif __BYTE_ORDER == __BIG_ENDIAN |
99 | /* Simply copy the data. */ |
100 | *inptrp = inptr + n_convert * 4; |
101 | *outptrp = __mempcpy (outptr, inptr, n_convert * 4); |
102 | #else |
103 | # error "This endianness is not supported." |
104 | #endif |
105 | |
106 | /* Determine the status. */ |
107 | if (*inptrp == inend) |
108 | result = __GCONV_EMPTY_INPUT; |
109 | else if (*outptrp + 4 > outend) |
110 | result = __GCONV_FULL_OUTPUT; |
111 | else |
112 | result = __GCONV_INCOMPLETE_INPUT; |
113 | |
114 | return result; |
115 | } |
116 | |
117 | |
118 | static inline int |
119 | __attribute ((always_inline)) |
120 | internal_ucs4_loop_single (struct __gconv_step *step, |
121 | struct __gconv_step_data *step_data, |
122 | const unsigned char **inptrp, |
123 | const unsigned char *inend, |
124 | unsigned char **outptrp, |
125 | const unsigned char *outend, |
126 | size_t *irreversible) |
127 | { |
128 | mbstate_t *state = step_data->__statep; |
129 | size_t cnt = state->__count & 7; |
130 | |
131 | while (*inptrp < inend && cnt < 4) |
132 | state->__value.__wchb[cnt++] = *(*inptrp)++; |
133 | |
134 | if (__glibc_unlikely (cnt < 4)) |
135 | { |
136 | /* Still not enough bytes. Store the ones in the input buffer. */ |
137 | state->__count &= ~7; |
138 | state->__count |= cnt; |
139 | |
140 | return __GCONV_INCOMPLETE_INPUT; |
141 | } |
142 | |
143 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
144 | (*outptrp)[0] = state->__value.__wchb[3]; |
145 | (*outptrp)[1] = state->__value.__wchb[2]; |
146 | (*outptrp)[2] = state->__value.__wchb[1]; |
147 | (*outptrp)[3] = state->__value.__wchb[0]; |
148 | |
149 | #elif __BYTE_ORDER == __BIG_ENDIAN |
150 | /* XXX unaligned */ |
151 | (*outptrp)[0] = state->__value.__wchb[0]; |
152 | (*outptrp)[1] = state->__value.__wchb[1]; |
153 | (*outptrp)[2] = state->__value.__wchb[2]; |
154 | (*outptrp)[3] = state->__value.__wchb[3]; |
155 | #else |
156 | # error "This endianness is not supported." |
157 | #endif |
158 | *outptrp += 4; |
159 | |
160 | /* Clear the state buffer. */ |
161 | state->__count &= ~7; |
162 | |
163 | return __GCONV_OK; |
164 | } |
165 | |
166 | #include <iconv/skeleton.c> |
167 | |
168 | |
/* Transform from UCS4 to the internal, UCS4-like format.  Unlike
   for the other direction we have to check for correct values here.  */
/* Settings for the <iconv/skeleton.c> inclusion that closes this section.
   skeleton.c is not visible here, so the exact meaning of each macro
   should be checked there.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* Both the source and the target use 4 bytes per character.  */
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs4_internal_loop
#define TO_LOOP ucs4_internal_loop /* This is not used.  */
/* Presumably the name of the entry point skeleton.c generates.  */
#define FUNCTION_NAME __gconv_transform_ucs4_internal
#define ONE_DIRECTION 0
180 | |
181 | |
182 | static inline int |
183 | __attribute ((always_inline)) |
184 | ucs4_internal_loop (struct __gconv_step *step, |
185 | struct __gconv_step_data *step_data, |
186 | const unsigned char **inptrp, const unsigned char *inend, |
187 | unsigned char **outptrp, const unsigned char *outend, |
188 | size_t *irreversible) |
189 | { |
190 | int flags = step_data->__flags; |
191 | const unsigned char *inptr = *inptrp; |
192 | unsigned char *outptr = *outptrp; |
193 | int result; |
194 | |
195 | for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4) |
196 | { |
197 | uint32_t inval = get32 (inptr); |
198 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
199 | inval = __builtin_bswap32 (inval); |
200 | #endif |
201 | |
202 | if (__glibc_unlikely (inval > 0x7fffffff)) |
203 | { |
204 | /* The value is too large. We don't try transliteration here since |
205 | this is not an error because of the lack of possibilities to |
206 | represent the result. This is a genuine bug in the input since |
207 | UCS4 does not allow such values. */ |
208 | if (irreversible == NULL) |
209 | /* We are transliterating, don't try to correct anything. */ |
210 | return __GCONV_ILLEGAL_INPUT; |
211 | |
212 | if (flags & __GCONV_IGNORE_ERRORS) |
213 | { |
214 | /* Just ignore this character. */ |
215 | ++*irreversible; |
216 | continue; |
217 | } |
218 | |
219 | *inptrp = inptr; |
220 | *outptrp = outptr; |
221 | return __GCONV_ILLEGAL_INPUT; |
222 | } |
223 | |
224 | put32 (outptr, inval); |
225 | outptr += sizeof (uint32_t); |
226 | } |
227 | |
228 | *inptrp = inptr; |
229 | *outptrp = outptr; |
230 | |
231 | /* Determine the status. */ |
232 | if (*inptrp == inend) |
233 | result = __GCONV_EMPTY_INPUT; |
234 | else if (*outptrp + 4 > outend) |
235 | result = __GCONV_FULL_OUTPUT; |
236 | else |
237 | result = __GCONV_INCOMPLETE_INPUT; |
238 | |
239 | return result; |
240 | } |
241 | |
242 | |
243 | static inline int |
244 | __attribute ((always_inline)) |
245 | ucs4_internal_loop_single (struct __gconv_step *step, |
246 | struct __gconv_step_data *step_data, |
247 | const unsigned char **inptrp, |
248 | const unsigned char *inend, |
249 | unsigned char **outptrp, |
250 | const unsigned char *outend, |
251 | size_t *irreversible) |
252 | { |
253 | mbstate_t *state = step_data->__statep; |
254 | int flags = step_data->__flags; |
255 | size_t cnt = state->__count & 7; |
256 | |
257 | while (*inptrp < inend && cnt < 4) |
258 | state->__value.__wchb[cnt++] = *(*inptrp)++; |
259 | |
260 | if (__glibc_unlikely (cnt < 4)) |
261 | { |
262 | /* Still not enough bytes. Store the ones in the input buffer. */ |
263 | state->__count &= ~7; |
264 | state->__count |= cnt; |
265 | |
266 | return __GCONV_INCOMPLETE_INPUT; |
267 | } |
268 | |
269 | if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80, |
270 | 0)) |
271 | { |
272 | /* The value is too large. We don't try transliteration here since |
273 | this is not an error because of the lack of possibilities to |
274 | represent the result. This is a genuine bug in the input since |
275 | UCS4 does not allow such values. */ |
276 | if (!(flags & __GCONV_IGNORE_ERRORS)) |
277 | { |
278 | *inptrp -= cnt - (state->__count & 7); |
279 | return __GCONV_ILLEGAL_INPUT; |
280 | } |
281 | } |
282 | else |
283 | { |
284 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
285 | (*outptrp)[0] = state->__value.__wchb[3]; |
286 | (*outptrp)[1] = state->__value.__wchb[2]; |
287 | (*outptrp)[2] = state->__value.__wchb[1]; |
288 | (*outptrp)[3] = state->__value.__wchb[0]; |
289 | #elif __BYTE_ORDER == __BIG_ENDIAN |
290 | (*outptrp)[0] = state->__value.__wchb[0]; |
291 | (*outptrp)[1] = state->__value.__wchb[1]; |
292 | (*outptrp)[2] = state->__value.__wchb[2]; |
293 | (*outptrp)[3] = state->__value.__wchb[3]; |
294 | #endif |
295 | |
296 | *outptrp += 4; |
297 | } |
298 | |
299 | /* Clear the state buffer. */ |
300 | state->__count &= ~7; |
301 | |
302 | return __GCONV_OK; |
303 | } |
304 | |
305 | #include <iconv/skeleton.c> |
306 | |
307 | |
/* Transform from the internal, UCS4-like format to UCS4 in little endian
   byte order.  */
/* Settings for the <iconv/skeleton.c> inclusion that closes this section.
   skeleton.c is not visible here, so the exact meaning of each macro
   should be checked there.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* Both the source and the target use 4 bytes per character.  */
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs4le_loop
#define TO_LOOP internal_ucs4le_loop /* This is not used.  */
/* Presumably the name of the entry point skeleton.c generates.  */
#define FUNCTION_NAME __gconv_transform_internal_ucs4le
#define ONE_DIRECTION 0
318 | |
319 | |
320 | static inline int |
321 | __attribute ((always_inline)) |
322 | internal_ucs4le_loop (struct __gconv_step *step, |
323 | struct __gconv_step_data *step_data, |
324 | const unsigned char **inptrp, const unsigned char *inend, |
325 | unsigned char **outptrp, const unsigned char *outend, |
326 | size_t *irreversible) |
327 | { |
328 | const unsigned char *inptr = *inptrp; |
329 | unsigned char *outptr = *outptrp; |
330 | size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; |
331 | int result; |
332 | |
333 | #if __BYTE_ORDER == __BIG_ENDIAN |
334 | /* Sigh, we have to do some real work. */ |
335 | size_t cnt; |
336 | |
337 | for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) |
338 | { |
339 | uint32_t val = get32 (inptr); |
340 | put32 (outptr, __builtin_bswap32 (val)); |
341 | } |
342 | |
343 | *inptrp = inptr; |
344 | *outptrp = outptr; |
345 | #elif __BYTE_ORDER == __LITTLE_ENDIAN |
346 | /* Simply copy the data. */ |
347 | *inptrp = inptr + n_convert * 4; |
348 | *outptrp = __mempcpy (outptr, inptr, n_convert * 4); |
349 | #else |
350 | # error "This endianness is not supported." |
351 | #endif |
352 | |
353 | /* Determine the status. */ |
354 | if (*inptrp == inend) |
355 | result = __GCONV_EMPTY_INPUT; |
356 | else if (*outptrp + 4 > outend) |
357 | result = __GCONV_FULL_OUTPUT; |
358 | else |
359 | result = __GCONV_INCOMPLETE_INPUT; |
360 | |
361 | return result; |
362 | } |
363 | |
364 | |
365 | static inline int |
366 | __attribute ((always_inline)) |
367 | internal_ucs4le_loop_single (struct __gconv_step *step, |
368 | struct __gconv_step_data *step_data, |
369 | const unsigned char **inptrp, |
370 | const unsigned char *inend, |
371 | unsigned char **outptrp, |
372 | const unsigned char *outend, |
373 | size_t *irreversible) |
374 | { |
375 | mbstate_t *state = step_data->__statep; |
376 | size_t cnt = state->__count & 7; |
377 | |
378 | while (*inptrp < inend && cnt < 4) |
379 | state->__value.__wchb[cnt++] = *(*inptrp)++; |
380 | |
381 | if (__glibc_unlikely (cnt < 4)) |
382 | { |
383 | /* Still not enough bytes. Store the ones in the input buffer. */ |
384 | state->__count &= ~7; |
385 | state->__count |= cnt; |
386 | |
387 | return __GCONV_INCOMPLETE_INPUT; |
388 | } |
389 | |
390 | #if __BYTE_ORDER == __BIG_ENDIAN |
391 | (*outptrp)[0] = state->__value.__wchb[3]; |
392 | (*outptrp)[1] = state->__value.__wchb[2]; |
393 | (*outptrp)[2] = state->__value.__wchb[1]; |
394 | (*outptrp)[3] = state->__value.__wchb[0]; |
395 | |
396 | #else |
397 | /* XXX unaligned */ |
398 | (*outptrp)[0] = state->__value.__wchb[0]; |
399 | (*outptrp)[1] = state->__value.__wchb[1]; |
400 | (*outptrp)[2] = state->__value.__wchb[2]; |
401 | (*outptrp)[3] = state->__value.__wchb[3]; |
402 | |
403 | #endif |
404 | |
405 | *outptrp += 4; |
406 | |
407 | /* Clear the state buffer. */ |
408 | state->__count &= ~7; |
409 | |
410 | return __GCONV_OK; |
411 | } |
412 | |
413 | #include <iconv/skeleton.c> |
414 | |
415 | |
/* And finally from UCS4-LE to the internal encoding.  */
/* Settings for the <iconv/skeleton.c> inclusion that closes this section.
   skeleton.c is not visible here, so the exact meaning of each macro
   should be checked there.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* Both the source and the target use 4 bytes per character.  */
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs4le_internal_loop
#define TO_LOOP ucs4le_internal_loop /* This is not used.  */
/* Presumably the name of the entry point skeleton.c generates.  */
#define FUNCTION_NAME __gconv_transform_ucs4le_internal
#define ONE_DIRECTION 0
426 | |
427 | |
428 | static inline int |
429 | __attribute ((always_inline)) |
430 | ucs4le_internal_loop (struct __gconv_step *step, |
431 | struct __gconv_step_data *step_data, |
432 | const unsigned char **inptrp, const unsigned char *inend, |
433 | unsigned char **outptrp, const unsigned char *outend, |
434 | size_t *irreversible) |
435 | { |
436 | int flags = step_data->__flags; |
437 | const unsigned char *inptr = *inptrp; |
438 | unsigned char *outptr = *outptrp; |
439 | int result; |
440 | |
441 | for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4) |
442 | { |
443 | uint32_t inval = get32 (inptr); |
444 | #if __BYTE_ORDER == __BIG_ENDIAN |
445 | inval = __builtin_bswap32 (inval); |
446 | #endif |
447 | |
448 | if (__glibc_unlikely (inval > 0x7fffffff)) |
449 | { |
450 | /* The value is too large. We don't try transliteration here since |
451 | this is not an error because of the lack of possibilities to |
452 | represent the result. This is a genuine bug in the input since |
453 | UCS4 does not allow such values. */ |
454 | if (irreversible == NULL) |
455 | /* We are transliterating, don't try to correct anything. */ |
456 | return __GCONV_ILLEGAL_INPUT; |
457 | |
458 | if (flags & __GCONV_IGNORE_ERRORS) |
459 | { |
460 | /* Just ignore this character. */ |
461 | ++*irreversible; |
462 | continue; |
463 | } |
464 | |
465 | *inptrp = inptr; |
466 | *outptrp = outptr; |
467 | return __GCONV_ILLEGAL_INPUT; |
468 | } |
469 | |
470 | put32 (outptr, inval); |
471 | outptr += sizeof (uint32_t); |
472 | } |
473 | |
474 | *inptrp = inptr; |
475 | *outptrp = outptr; |
476 | |
477 | /* Determine the status. */ |
478 | if (*inptrp == inend) |
479 | result = __GCONV_EMPTY_INPUT; |
480 | else if (*inptrp + 4 > inend) |
481 | result = __GCONV_INCOMPLETE_INPUT; |
482 | else |
483 | { |
484 | assert (*outptrp + 4 > outend); |
485 | result = __GCONV_FULL_OUTPUT; |
486 | } |
487 | |
488 | return result; |
489 | } |
490 | |
491 | |
492 | static inline int |
493 | __attribute ((always_inline)) |
494 | ucs4le_internal_loop_single (struct __gconv_step *step, |
495 | struct __gconv_step_data *step_data, |
496 | const unsigned char **inptrp, |
497 | const unsigned char *inend, |
498 | unsigned char **outptrp, |
499 | const unsigned char *outend, |
500 | size_t *irreversible) |
501 | { |
502 | mbstate_t *state = step_data->__statep; |
503 | int flags = step_data->__flags; |
504 | size_t cnt = state->__count & 7; |
505 | |
506 | while (*inptrp < inend && cnt < 4) |
507 | state->__value.__wchb[cnt++] = *(*inptrp)++; |
508 | |
509 | if (__glibc_unlikely (cnt < 4)) |
510 | { |
511 | /* Still not enough bytes. Store the ones in the input buffer. */ |
512 | state->__count &= ~7; |
513 | state->__count |= cnt; |
514 | |
515 | return __GCONV_INCOMPLETE_INPUT; |
516 | } |
517 | |
518 | if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80, |
519 | 0)) |
520 | { |
521 | /* The value is too large. We don't try transliteration here since |
522 | this is not an error because of the lack of possibilities to |
523 | represent the result. This is a genuine bug in the input since |
524 | UCS4 does not allow such values. */ |
525 | if (!(flags & __GCONV_IGNORE_ERRORS)) |
526 | return __GCONV_ILLEGAL_INPUT; |
527 | } |
528 | else |
529 | { |
530 | #if __BYTE_ORDER == __BIG_ENDIAN |
531 | (*outptrp)[0] = state->__value.__wchb[3]; |
532 | (*outptrp)[1] = state->__value.__wchb[2]; |
533 | (*outptrp)[2] = state->__value.__wchb[1]; |
534 | (*outptrp)[3] = state->__value.__wchb[0]; |
535 | #else |
536 | (*outptrp)[0] = state->__value.__wchb[0]; |
537 | (*outptrp)[1] = state->__value.__wchb[1]; |
538 | (*outptrp)[2] = state->__value.__wchb[2]; |
539 | (*outptrp)[3] = state->__value.__wchb[3]; |
540 | #endif |
541 | |
542 | *outptrp += 4; |
543 | } |
544 | |
545 | /* Clear the state buffer. */ |
546 | state->__count &= ~7; |
547 | |
548 | return __GCONV_OK; |
549 | } |
550 | |
551 | #include <iconv/skeleton.c> |
552 | |
553 | |
/* ISO 646-IRV (ASCII) to the internal, UCS4-like format: one input byte
   becomes one 32-bit unit.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 1
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ascii_internal_loop
#define TO_LOOP ascii_internal_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_ascii_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    if (__glibc_likely (*inptr <= '\x7f')) \
      { \
        /* A valid ASCII byte: widen it to one 32-bit unit.  */ \
        *((uint32_t *) outptr) = *inptr; \
        ++inptr; \
        outptr += sizeof (uint32_t); \
      } \
    else \
      { \
        /* Bytes above 0x7f are not ASCII.  This is invalid input rather \
           than something that cannot be represented, so it goes straight \
           to the standard error handler for one input byte.  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (1); \
      } \
  }
#define LOOP_NEED_FLAGS
586 | #include <iconv/loop.c> |
587 | #include <iconv/skeleton.c> |
588 | |
589 | |
/* Internal, UCS4-like format to ISO 646-IRV (ASCII): one 32-bit unit
   becomes one output byte.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 1
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ascii_loop
#define TO_LOOP internal_ascii_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_internal_ascii
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    if (__glibc_likely (*((const uint32_t *) inptr) <= 0x7f)) \
      { \
        /* Representable in ASCII: emit the low byte.  */ \
        *outptr = *((const uint32_t *) inptr); \
        ++outptr; \
        inptr += sizeof (uint32_t); \
      } \
    else \
      { \
        /* Not representable: give the tag handler a chance first, then \
           fall back to the standard error handling for a 4-byte unit.  */ \
        UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
  }
#define LOOP_NEED_FLAGS
619 | #include <iconv/loop.c> |
620 | #include <iconv/skeleton.c> |
621 | |
622 | |
/* Convert from the internal (UCS4-like) format to UTF-8.  */
/* Settings for the <iconv/loop.c> and <iconv/skeleton.c> inclusions that
   close this section.  Each 4-byte input unit produces between 1 and 6
   output bytes.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 1
#define MAX_NEEDED_TO 6
#define FROM_DIRECTION 1
#define FROM_LOOP internal_utf8_loop
#define TO_LOOP internal_utf8_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_internal_utf8
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Per-character conversion step.  The names inptr, outptr, outend and
   result are not defined in this file; presumably they come from
   <iconv/loop.c>, which also provides the loop that `break' leaves.  */
#define BODY \
  { \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc < 0x80)) \
      /* It's a one byte sequence.  */ \
      *outptr++ = (unsigned char) wc; \
    else if (__glibc_likely (wc <= 0x7fffffff \
                             && (wc < 0xd800 || wc > 0xdfff))) \
      { \
        size_t step; \
        unsigned char *start; \
        \
        /* Find the sequence length: the smallest step in 2..5 such that \
           wc has no bits at or above position 5 * step + 1; if there is \
           none the loop ends with step == 6.  */ \
        for (step = 2; step < 6; ++step) \
          if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
            break; \
        \
        if (__glibc_unlikely (outptr + step > outend)) \
          { \
            /* Too long.  The input pointer is not advanced in this \
               case.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        start = outptr; \
        /* Lead byte marker: the top `step' bits set.  NOTE(review): \
           this shifts the negative int ~0xff to the right, which only \
           yields the marker with an arithmetic shift.  */ \
        *outptr = (unsigned char) (~0xff >> step); \
        outptr += step; \
        /* Fill in the continuation bytes from the last one backwards, \
           six payload bits each.  */ \
        do \
          { \
            start[--step] = 0x80 | (wc & 0x3f); \
            wc >>= 6; \
          } \
        while (step > 1); \
        /* The remaining high bits go into the lead byte.  */ \
        start[0] |= wc; \
      } \
    else \
      { \
        /* Values above 0x7fffffff and the surrogate range \
           0xd800..0xdfff are rejected.  */ \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
682 | #include <iconv/loop.c> |
683 | #include <iconv/skeleton.c> |
684 | |
685 | |
/* Convert from UTF-8 to the internal (UCS4-like) format.  */
/* Settings for the <iconv/loop.c> and <iconv/skeleton.c> inclusions that
   close this section.  Between 1 and 6 input bytes produce one 4-byte
   output unit.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP utf8_internal_loop
#define TO_LOOP utf8_internal_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_utf8_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Per-character conversion step.  The names inptr, inend, outptr and
   result are not defined in this file; presumably they come from
   <iconv/loop.c>, which also provides the loop that `break' leaves.  */
#define BODY \
  { \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        unsigned int cnt; \
        unsigned int i; \
        \
        /* Classify the lead byte: cnt is the total sequence length and \
           ch keeps only the payload bits of the lead byte.  */ \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1, \
               otherwise the wide character could have been represented \
               using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  At most five \
               bytes are skipped.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
            \
          /* Shared error exit; i is the number of bytes to skip.  The \
             code below jumps here as well.  The code after this branch \
             reads cnt, which is not set on this path, so the handler is \
             presumably expected never to fall through.  */ \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report that check \
               that all the bytes are correct.  */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            /* A non-continuation byte was found at offset i.  */ \
            goto errout; \
          } \
        \
        /* Read the possible remaining bytes.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
            \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
            \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
        \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates.  */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
813 | |
/* Save an incomplete UTF-8 sequence in the conversion state.
   The names inptrp, inend and state are not defined in this file;
   presumably <iconv/loop.c> supplies them and expands this macro when the
   input ends in the middle of a character.
   Afterwards the low byte of state->__count holds the number of bytes
   that were available, bits 8 and up hold the total length of the
   sequence, and state->__value.__wch holds the payload bits collected so
   far, shifted to the position they would have in the complete
   character.  *inptrp ends up at inend.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format.  We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer.  */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
    \
    /* Number of bytes of the sequence that are present.  */ \
    state->__count = inend - *inptrp; \
    \
    assert (ch != 0xc0 && ch != 0xc1); \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* We expect two bytes.  The first byte cannot be 0xc0 or \
           0xc1, otherwise the wide character could have been \
           represented using a single byte.  */ \
        cnt = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* We expect three bytes.  */ \
        cnt = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* We expect four bytes.  */ \
        cnt = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* We expect five bytes.  */ \
        cnt = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* We expect six bytes.  */ \
        cnt = 6; \
        ch &= 0x01; \
      } \
    \
    /* The first byte is already consumed.  r counts the bytes of the \
       sequence that are still missing.  */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch <<= 6; \
        ch |= **inptrp & 0x3f; \
        --r; \
      } \
    \
    /* Shift for the so far missing bytes.  */ \
    ch <<= r * 6; \
    \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= cnt << 8; \
    \
    /* Store the value.  */ \
    state->__value.__wch = ch; \
  }
877 | |
/* Rebuild the bytes of an incomplete UTF-8 sequence from the conversion
   state, i.e. undo what STORE_REST stored.
   The names state, inlen and bytebuf are not defined in this file;
   presumably <iconv/loop.c> supplies them.  inlen receives the low byte
   of state->__count, and bytebuf[0 .. inlen-1] receives the lead byte
   and the continuation bytes that had been read.  */
#define UNPACK_BYTES \
  { \
    /* Lead byte markers for sequences of 2 to 6 bytes.  */ \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    /* Walk from the last position of the sequence down to position 1, \
       consuming six bits of wch per position.  Only positions below \
       inlen are written.  */ \
    do \
      { \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    /* What is left of wch goes into the lead byte.  */ \
    bytebuf[0] |= wch; \
  }
898 | |
/* Reset the conversion state: no stored bytes, no expected length.  */
#define CLEAR_STATE \
  state->__count = 0
901 | |
902 | |
903 | #include <iconv/loop.c> |
904 | #include <iconv/skeleton.c> |
905 | |
906 | |
/* UCS2 to the internal, UCS4-like format: one 16-bit unit becomes one
   32-bit unit.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs2_internal_loop
#define TO_LOOP ucs2_internal_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_ucs2_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    uint16_t unit = get16 (inptr); \
    \
    /* 0xd800..0xdfff are exactly the values whose top five bits are \
       11011.  */ \
    if (__glibc_unlikely ((unit & 0xf800) == 0xd800)) \
      { \
        /* Surrogates are not valid UCS-2 input, so they are rejected \
           (not security relevant at this point).  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (2); \
      } \
    \
    /* Widen the unit to 32 bits and step over it.  */ \
    *((uint32_t *) outptr) = unit; \
    inptr += 2; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
937 | #include <iconv/loop.c> |
938 | #include <iconv/skeleton.c> |
939 | |
940 | |
/* Internal, UCS4-like format to UCS2: one 32-bit unit becomes one 16-bit
   unit.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 2
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs2_loop
#define TO_LOOP internal_ucs2_loop /* This is not used.  */
#define FUNCTION_NAME __gconv_transform_internal_ucs2
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    uint32_t ucs = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (ucs < 0xd800 || (ucs >= 0xe000 && ucs < 0x10000))) \
      { \
        /* Fits into 16 bits and is no surrogate: store it.  */ \
        put16 (outptr, ucs); \
        outptr += sizeof (uint16_t); \
        inptr += 4; \
      } \
    else if (ucs >= 0x10000) \
      { \
        /* Too large for UCS2.  */ \
        UNICODE_TAG_HANDLER (ucs, 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else \
      { \
        /* A surrogate (0xd800..0xdfff).  These must not pass through: \
           the UCS-2 output might be read as UTF-16 by other programs, \
           which would let an attacker synthesize any plane 1-16 \
           character.  Note that result is set before the ignore check, \
           so it stays __GCONV_ILLEGAL_INPUT even when the character is \
           merely skipped.  */ \
        result = __GCONV_ILLEGAL_INPUT; \
        if (! ignore_errors_p ()) \
          break; \
        inptr += 4; \
        ++*irreversible; \
        continue; \
      } \
  }
#define LOOP_NEED_FLAGS
987 | #include <iconv/loop.c> |
988 | #include <iconv/skeleton.c> |
989 | |
990 | |
/* UCS2 in the byte order opposite to the host's to the internal,
   UCS4-like format.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs2reverse_internal_loop
#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    uint16_t unit = bswap_16 (get16 (inptr)); \
    \
    /* 0xd800..0xdfff are exactly the values whose top five bits are \
       11011.  */ \
    if (__glibc_unlikely ((unit & 0xf800) == 0xd800)) \
      { \
        /* Surrogates are not valid UCS-2 input, so they are rejected \
           (not security relevant at this point).  When errors are \
           ignored the unit is skipped and counted, and result is left \
           untouched.  */ \
        if (ignore_errors_p ()) \
          { \
            inptr += 2; \
            ++*irreversible; \
            continue; \
          } \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
    \
    /* Widen the swapped unit to 32 bits and step over it.  */ \
    *((uint32_t *) outptr) = unit; \
    inptr += 2; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
1028 | #include <iconv/loop.c> |
1029 | #include <iconv/skeleton.c> |
1030 | |
1031 | |
/* Internal, UCS4-like format to UCS2 in the byte order opposite to the
   host's.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 2
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs2reverse_loop
#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    uint32_t ucs = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (ucs < 0xd800 || (ucs >= 0xe000 && ucs < 0x10000))) \
      { \
        /* Fits into 16 bits and is no surrogate: store it byte \
           swapped.  */ \
        put16 (outptr, bswap_16 (ucs)); \
        outptr += sizeof (uint16_t); \
        inptr += 4; \
      } \
    else if (ucs >= 0x10000) \
      { \
        /* Too large for UCS2.  */ \
        UNICODE_TAG_HANDLER (ucs, 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else \
      { \
        /* A surrogate (0xd800..0xdfff).  These must not pass through: \
           the UCS-2 output might be read as UTF-16 by other programs, \
           which would let an attacker synthesize any plane 1-16 \
           character.  When errors are ignored the character is skipped \
           and counted, and result is left untouched.  */ \
        if (ignore_errors_p ()) \
          { \
            inptr += 4; \
            ++*irreversible; \
            continue; \
          } \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
  }
#define LOOP_NEED_FLAGS
1079 | #include <iconv/loop.c> |
1080 | #include <iconv/skeleton.c> |
1081 | |