1 | /* Conversion loop frame work. |
2 | Copyright (C) 1998-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* This file provides a frame for the reader loop in all conversion modules. |
20 | The actual code must (of course) be provided in the actual module source |
21 | code but certain actions can be written down generically, with some |
22 | customization options which are these: |
23 | |
24 | MIN_NEEDED_INPUT minimal number of input bytes needed for the next |
25 | conversion. |
26 | MIN_NEEDED_OUTPUT minimal number of bytes produced by the next round |
27 | of conversion. |
28 | |
     MAX_NEEDED_INPUT	you guessed it, this is the maximal number of input
			bytes needed.  It defaults to MIN_NEEDED_INPUT.
31 | MAX_NEEDED_OUTPUT likewise for output bytes. |
32 | |
33 | LOOPFCT name of the function created. If not specified |
34 | the name is `loop' but this prevents the use |
35 | of multiple functions in the same file. |
36 | |
37 | BODY this is supposed to expand to the body of the loop. |
38 | The user must provide this. |
39 | |
40 | EXTRA_LOOP_DECLS extra arguments passed from conversion loop call. |
41 | |
42 | INIT_PARAMS code to define and initialize variables from params. |
43 | UPDATE_PARAMS code to store result in params. |
44 | |
45 | ONEBYTE_BODY body of the specialized conversion function for a |
46 | single byte from the current character set to INTERNAL. |
47 | */ |
48 | |
49 | #include <assert.h> |
50 | #include <endian.h> |
51 | #include <iconv/gconv_int.h> |
52 | #include <stdint.h> |
53 | #include <string.h> |
54 | #include <wchar.h> |
55 | #include <sys/param.h> /* For MIN. */ |
56 | #define __need_size_t |
57 | #include <stddef.h> |
58 | #include <libc-diag.h> |
59 | |
/* We have to provide support for machines which are not able to handle
   unaligned memory accesses.  Some of the character encodings have
   representations with a fixed width of 2 or 4 bytes.  But if we cannot
   access unaligned memory we still have to read byte-wise.

   get16/get32 read and put16/put32 write a 16- or 32-bit value in host
   byte order at ADDR.  The direct variant is used when the architecture
   allows unaligned accesses or when DEFINE_UNALIGNED is not set; the
   byte-wise variant is used otherwise and FCTNAME then appends
   `_unaligned' to the generated function name.  */
#undef FCTNAME2
#if _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
/* We can handle unaligned memory access.  */
# define get16(addr) (*((const uint16_t *) (addr)))
# define get32(addr) (*((const uint32_t *) (addr)))

/* We need no special support for writing values either.  */
# define put16(addr, val) (*((uint16_t *) (addr)) = (val))
# define put32(addr, val) (*((uint32_t *) (addr)) = (val))

# define FCTNAME2(name) name
#else
/* Distinguish between big endian and little endian.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16(addr) \
  (((const unsigned char *) (addr))[1] << 8 \
   | ((const unsigned char *) (addr))[0])
/* The most significant byte is widened to uint32_t before shifting so
   that the three shifts never overflow a signed int and the result has
   the same type as in the direct-access variant.  */
#  define get32(addr) \
  ((((uint32_t) ((const unsigned char *) (addr))[3] << 8 \
     | ((const unsigned char *) (addr))[2]) << 8 \
    | ((const unsigned char *) (addr))[1]) << 8 \
   | ((const unsigned char *) (addr))[0])

#  define put16(addr, val) \
  ({ uint16_t __val = (val); \
     ((unsigned char *) (addr))[0] = __val; \
     ((unsigned char *) (addr))[1] = __val >> 8; \
     (void) 0; })
#  define put32(addr, val) \
  ({ uint32_t __val = (val); \
     ((unsigned char *) (addr))[0] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[1] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[2] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[3] = __val; \
     (void) 0; })
# else
#  define get16(addr) \
  (((const unsigned char *) (addr))[0] << 8 \
   | ((const unsigned char *) (addr))[1])
/* See the little endian variant for why the first byte is widened.  */
#  define get32(addr) \
  ((((uint32_t) ((const unsigned char *) (addr))[0] << 8 \
     | ((const unsigned char *) (addr))[1]) << 8 \
    | ((const unsigned char *) (addr))[2]) << 8 \
   | ((const unsigned char *) (addr))[3])

#  define put16(addr, val) \
  ({ uint16_t __val = (val); \
     ((unsigned char *) (addr))[1] = __val; \
     ((unsigned char *) (addr))[0] = __val >> 8; \
     (void) 0; })
#  define put32(addr, val) \
  ({ uint32_t __val = (val); \
     ((unsigned char *) (addr))[3] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[2] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[1] = __val; \
     __val >>= 8; \
     ((unsigned char *) (addr))[0] = __val; \
     (void) 0; })
# endif

# define FCTNAME2(name) name##_unaligned
#endif
/* The extra level of indirection lets NAME be macro-expanded before the
   suffix is pasted on.  */
#define FCTNAME(name) FCTNAME2(name)
132 | |
133 | |
134 | /* We need at least one byte for the next round. */ |
135 | #ifndef MIN_NEEDED_INPUT |
136 | # error "MIN_NEEDED_INPUT definition missing" |
137 | #elif MIN_NEEDED_INPUT < 1 |
138 | # error "MIN_NEEDED_INPUT must be >= 1" |
139 | #endif |
140 | |
141 | /* Let's see how many bytes we produce. */ |
142 | #ifndef MAX_NEEDED_INPUT |
143 | # define MAX_NEEDED_INPUT MIN_NEEDED_INPUT |
144 | #endif |
145 | |
146 | /* We produce at least one byte in the next round. */ |
147 | #ifndef MIN_NEEDED_OUTPUT |
148 | # error "MIN_NEEDED_OUTPUT definition missing" |
149 | #elif MIN_NEEDED_OUTPUT < 1 |
150 | # error "MIN_NEEDED_OUTPUT must be >= 1" |
151 | #endif |
152 | |
153 | /* Let's see how many bytes we produce. */ |
154 | #ifndef MAX_NEEDED_OUTPUT |
155 | # define MAX_NEEDED_OUTPUT MIN_NEEDED_OUTPUT |
156 | #endif |
157 | |
158 | /* Default name for the function. */ |
159 | #ifndef LOOPFCT |
160 | # define LOOPFCT loop |
161 | #endif |
162 | |
163 | /* Make sure we have a loop body. */ |
164 | #ifndef BODY |
165 | # error "Definition of BODY missing for function" LOOPFCT |
166 | #endif |
167 | |
168 | |
/* If no extra arguments have to be passed to the loop function, define
   the macro as empty so that the parameter lists below, which end in
   `size_t *irreversible EXTRA_LOOP_DECLS', stay well-formed.  */
#ifndef EXTRA_LOOP_DECLS
# define EXTRA_LOOP_DECLS
#endif
174 | |
/* UPDATE_PARAMS and REINIT_PARAMS are expanded inside other macros, where
   an `#ifdef' test is not possible.  Give both a do-nothing default so
   they can be used unconditionally as statements.  */
#if !defined UPDATE_PARAMS
# define UPDATE_PARAMS ((void) 0)
#endif
#if !defined REINIT_PARAMS
# define REINIT_PARAMS ((void) 0)
#endif
183 | |
184 | |
/* Convenience test for module writers: nonzero when conversion errors are
   to be skipped.  This requires a counter for irreversible conversions
   (IRREVERSIBLE is not a null pointer) and the __GCONV_IGNORE_ERRORS bit
   in FLAGS.  Both names are taken from the scope the macro is used in.  */
#define ignore_errors_p() \
  (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS) != 0)
189 | |
190 | |
/* Error handling for the FROM_LOOP direction, with ignoring of errors.
   RESULT is always set to __GCONV_ILLEGAL_INPUT.  When errors are being
   ignored, INCR input bytes are skipped, *IRREVERSIBLE is incremented and
   the enclosing loop continues; otherwise the enclosing loop is left.
   This cannot be wrapped in do ... while (0) because the `break' and
   `continue' have to act on the loop the macro is used in.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (ignore_errors_p ()) \
      { \
	/* Step over the invalid input byte sequence.  RESULT stays \
	   __GCONV_ILLEGAL_INPUT, because of the constraint that \
	   "iconv -c" must give the same exitcode as "iconv".  */ \
	inptr += (Incr); \
	++*irreversible; \
	continue; \
      } \
 \
    /* Errors are not ignored: stop converting.  */ \
    break; \
  }
208 | |
/* Error handling for the TO_LOOP direction.  The steps are, in order:
   leave the loop if IRREVERSIBLE is a null pointer; otherwise try
   __gconv_transliterate when the step has __GCONV_TRANSLIT set; if that
   did not resolve the problem and errors are ignored, skip INCR input
   bytes and count one irreversible conversion.  This cannot be wrapped
   in do ... while (0) because the `break' and `continue' have to act on
   the loop the macro is used in.  */
#define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    /* A null IRREVERSIBLE means we are in a call from \
       __gconv_transliterate.  In this case we are not doing any error \
       recovery ourselves.  */ \
    if (irreversible == NULL) \
      break; \
 \
    /* If needed, flush any conversion state, so that \
       __gconv_transliterate starts with the current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* Give the transliteration methods the first chance.  */ \
    if ((step_data->__flags & __GCONV_TRANSLIT) != 0) \
      result = __gconv_transliterate (step, step_data, *inptrp, &inptr, \
				      inend, &outptr, irreversible); \
 \
    REINIT_PARAMS; \
 \
    /* Transliteration ran out of output space: stop here.  */ \
    if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
      break; \
 \
    /* Any other status than "illegal input" means the input was \
       recognized, so carry on with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      continue; \
 \
    /* Still an error.  Stop unless errors are to be ignored.  */ \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* Ignore the character.  RESULT stays __GCONV_ILLEGAL_INPUT, because \
       of the constraint that "iconv -c" must give the same exitcode as \
       "iconv".  */ \
    ++*irreversible; \
    inptr += (Incr); \
    continue; \
  }
254 | |
255 | |
256 | /* With GCC 7 when compiling with -Os for 32-bit s390 the compiler |
257 | warns that the variable 'ch', in the definition of BODY in |
258 | sysdeps/s390/multiarch/8bit-generic.c, may be used uninitialized in |
259 | the call to UNICODE_TAG_HANDLER in that macro. This variable is |
260 | actually always initialized before use, in the prior loop if INDEX |
261 | is nonzero and in the following 'if' if INDEX is zero. That code |
262 | has a comment referencing this diagnostic disabling; updates in one |
263 | place may require updates in the other. */ |
264 | DIAG_PUSH_NEEDS_COMMENT; |
265 | DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized" ); |
/* Handling of Unicode 3.1 TAG characters.  Unicode recommends
   "If language codes are not relevant to the particular processing
   operation, then they should be ignored."  If CHARACTER is a TAG
   character, INCR input bytes are skipped and the enclosing loop
   continues; otherwise nothing happens.  This macro is usually called
   right before STANDARD_TO_LOOP_ERR_HANDLER (Incr).  */
#define UNICODE_TAG_HANDLER(Character, Incr) \
  { \
    /* U+E0000..U+E007F is exactly the set of values that agree with \
       0xe0000 in every bit above the low seven.  */ \
    if (((Character) >> 7) == (0xe0000 >> 7)) \
      { \
	inptr += (Incr); \
	continue; \
      } \
  }
279 | DIAG_POP_NEEDS_COMMENT; |
280 | |
281 | |
282 | /* The function returns the status, as defined in gconv.h. */ |
283 | static inline int |
284 | __attribute ((always_inline)) |
285 | FCTNAME (LOOPFCT) (struct __gconv_step *step, |
286 | struct __gconv_step_data *step_data, |
287 | const unsigned char **inptrp, const unsigned char *inend, |
288 | unsigned char **outptrp, const unsigned char *outend, |
289 | size_t *irreversible EXTRA_LOOP_DECLS) |
290 | { |
291 | #ifdef LOOP_NEED_STATE |
292 | mbstate_t *state = step_data->__statep; |
293 | #endif |
294 | #ifdef LOOP_NEED_FLAGS |
295 | int flags = step_data->__flags; |
296 | #endif |
297 | #ifdef LOOP_NEED_DATA |
298 | void *data = step->__data; |
299 | #endif |
300 | int result = __GCONV_EMPTY_INPUT; |
301 | const unsigned char *inptr = *inptrp; |
302 | unsigned char *outptr = *outptrp; |
303 | |
304 | #ifdef INIT_PARAMS |
305 | INIT_PARAMS; |
306 | #endif |
307 | |
308 | while (inptr != inend) |
309 | { |
310 | /* `if' cases for MIN_NEEDED_OUTPUT ==/!= 1 is made to help the |
311 | compiler generating better code. They will be optimized away |
312 | since MIN_NEEDED_OUTPUT is always a constant. */ |
313 | if (MIN_NEEDED_INPUT > 1 |
314 | && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0)) |
315 | { |
316 | /* We don't have enough input for another complete input |
317 | character. */ |
318 | result = __GCONV_INCOMPLETE_INPUT; |
319 | break; |
320 | } |
321 | if ((MIN_NEEDED_OUTPUT != 1 |
322 | && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0)) |
323 | || (MIN_NEEDED_OUTPUT == 1 |
324 | && __builtin_expect (outptr >= outend, 0))) |
325 | { |
326 | /* Overflow in the output buffer. */ |
327 | result = __GCONV_FULL_OUTPUT; |
328 | break; |
329 | } |
330 | |
331 | /* Here comes the body the user provides. It can stop with |
332 | RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the |
333 | input characters vary in size), GCONV_ILLEGAL_INPUT, or |
334 | GCONV_FULL_OUTPUT (if the output characters vary in size). */ |
335 | BODY |
336 | } |
337 | |
338 | /* Update the pointers pointed to by the parameters. */ |
339 | *inptrp = inptr; |
340 | *outptrp = outptr; |
341 | UPDATE_PARAMS; |
342 | |
343 | return result; |
344 | } |
345 | |
346 | |
347 | /* Include the file a second time to define the function to handle |
348 | unaligned access. */ |
349 | #if !defined DEFINE_UNALIGNED && !_STRING_ARCH_unaligned \ |
350 | && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \ |
351 | && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0 |
352 | # undef get16 |
353 | # undef get32 |
354 | # undef put16 |
355 | # undef put32 |
356 | # undef unaligned |
357 | |
358 | # define DEFINE_UNALIGNED |
359 | # include "loop.c" |
360 | # undef DEFINE_UNALIGNED |
361 | #else |
# if MAX_NEEDED_INPUT > 1
#  define SINGLE(fct) SINGLE2 (fct)
#  define SINGLE2(fct) fct##_single
/* Convert a single character whose leading bytes were saved in the
   conversion state.  The saved bytes (their number is kept in the low
   three bits of state->__count unless UNPACK_BYTES is provided) are
   copied to a local buffer, bytes from *INPTRP are appended, and BODY is
   run once on that buffer.

   Returns __GCONV_INCOMPLETE_INPUT if the character is still incomplete
   (the available bytes are saved again), __GCONV_FULL_OUTPUT if the
   output buffer has no room, __GCONV_OK if a character was converted,
   and otherwise whatever status BODY left in RESULT.  On success *INPTRP
   is advanced by the number of newly consumed bytes and *OUTPTRP is
   updated.

   The local names are what BODY and the optional UNPACK_BYTES,
   STORE_REST and CLEAR_STATE macros refer to, so they must not be
   renamed.  */
static inline int
__attribute ((always_inline))
SINGLE(LOOPFCT) (struct __gconv_step *step,
		 struct __gconv_step_data *step_data,
		 const unsigned char **inptrp, const unsigned char *inend,
		 unsigned char **outptrp, unsigned char *outend,
		 size_t *irreversible EXTRA_LOOP_DECLS)
{
  mbstate_t *state = step_data->__statep;
#  ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
#  endif
#  ifdef LOOP_NEED_DATA
  void *data = step->__data;
#  endif
  int result = __GCONV_OK;
  unsigned char bytebuf[MAX_NEEDED_INPUT];
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t inlen;

#  ifdef INIT_PARAMS
  INIT_PARAMS;
#  endif

#  ifdef UNPACK_BYTES
  UNPACK_BYTES
#  else
  /* Add the bytes from the state to the input buffer.  */
  assert ((state->__count & 7) <= sizeof (state->__value));
  for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
    bytebuf[inlen] = state->__value.__wchb[inlen];
#  endif

  /* Are there enough bytes in the input buffer?  */
  if (MIN_NEEDED_INPUT > 1
      && __builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
    {
      /* All the input there is gets consumed into the state.  */
      *inptrp = inend;
#  ifdef STORE_REST

      /* Building with -O3 GCC emits a `array subscript is above array
	 bounds' warning.  GCC BZ #64739 has been opened for this.  */
      DIAG_PUSH_NEEDS_COMMENT;
      DIAG_IGNORE_NEEDS_COMMENT (4.9, "-Warray-bounds");
      while (inptr < inend)
	bytebuf[inlen++] = *inptr++;
      DIAG_POP_NEEDS_COMMENT;

      /* STORE_REST works on the local buffer through these names.  */
      inptr = bytebuf;
      inptrp = &inptr;
      inend = &bytebuf[inlen];

      STORE_REST
#  else
      /* We don't have enough input for another complete input
	 character.  Append what there is to the saved bytes.
	 NOTE(review): the low bits of state->__count are not updated
	 in this branch -- confirm that this is intended.  */
      size_t inlen_after = inlen + (inend - inptr);
      assert (inlen_after <= sizeof (state->__value.__wchb));
      for (; inlen < inlen_after; inlen++)
	state->__value.__wchb[inlen] = *inptr++;
#  endif

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Enough space in output buffer.  */
  if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
      || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
    /* Overflow in the output buffer.  */
    return __GCONV_FULL_OUTPUT;

  /* Now add characters from the normal input buffer.  */
  if (inlen >= MAX_NEEDED_INPUT || inptr >= inend)
    /* Avoid a -Wstringop-overflow= warning when this loop is
       unrolled.  The compiler cannot otherwise see that this is
       unreachable because it depends on (state->__count & 7) not
       being too large after a previous conversion step.
       Starting with GCC 12, we also have to mark the inptr >= inend
       case as unreachable to omit the warning.  Note that this SINGLE
       function is only used to implement the mb*towc*() or wc*tomb*()
       functions.  Those functions use inptr and inend pointing to a
       variable on stack, compute the inend pointer or explicitly check
       the arguments which always leads to inptr < inend.  */
    __builtin_unreachable ();
  do
    bytebuf[inlen++] = *inptr++;
  while (inlen < MAX_NEEDED_INPUT && inptr < inend);

  /* From here on BODY reads from the local buffer.  */
  inptr = bytebuf;
  inend = &bytebuf[inlen];

  /* Run BODY exactly once; the one-trip loop gives its `break' and
     `continue' statements something to act on.  */
  do
    {
      BODY
    }
  while (0);

  /* Now we either have produced an output character and consumed all the
     bytes from the state and at least one more, or the character is still
     incomplete, or we have some other error (like illegal input character,
     no space in output buffer).  */
  if (__glibc_likely (inptr != bytebuf))
    {
      /* We found a new character.  */
      assert (inptr - bytebuf > (state->__count & 7));

      /* Only the bytes beyond those taken from the state came from the
	 caller's buffer.  */
      *inptrp += inptr - bytebuf - (state->__count & 7);
      *outptrp = outptr;

      result = __GCONV_OK;

      /* Clear the state buffer.  */
#  ifdef CLEAR_STATE
      CLEAR_STATE;
#  else
      state->__count &= ~7;
#  endif
    }
  else if (result == __GCONV_INCOMPLETE_INPUT)
    {
      /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
	 available.  */
      assert (inend != &bytebuf[MAX_NEEDED_INPUT]);

      *inptrp += inend - bytebuf - (state->__count & 7);
#  ifdef STORE_REST
      inptrp = &inptr;

      STORE_REST
#  else
      /* We don't have enough input for another complete input
	 character.  The buffer must now hold more bytes than were saved
	 before; compare against the byte count in the low three bits,
	 not against the flag bits above them.  */
      assert (inend - inptr > (state->__count & 7));
      assert (inend - inptr <= sizeof (state->__value.__wchb));
      state->__count = (state->__count & ~7) | (inend - inptr);
      for (inlen = 0; inlen < inend - inptr; inlen++)
	state->__value.__wchb[inlen] = inptr[inlen];
      inptr = inend;
#  endif
    }

  return result;
}
#  undef SINGLE
#  undef SINGLE2
# endif
512 | |
513 | |
# if defined ONEBYTE_BODY
/* Shortcut function for btowc.  Only the signature is written here; the
   function body is whatever ONEBYTE_BODY expands to.  FROM_ONEBYTE is
   then set to the name of this function.  */
static wint_t
gconv_btowc (struct __gconv_step *step, unsigned char c)
  ONEBYTE_BODY
#  define FROM_ONEBYTE gconv_btowc
# endif
521 | |
522 | #endif |
523 | |
/* Drop every customization and helper macro again so that this file can
   be included once more to define another function.  */

/* Input and output size limits.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT

/* Function name, body and parameter hooks.  */
#undef LOOPFCT
#undef BODY
#undef EXTRA_LOOP_DECLS
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef REINIT_PARAMS
#undef ONEBYTE_BODY

/* State handling hooks and requests for optional locals.  */
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA

/* Memory access helpers.  */
#undef get16
#undef get32
#undef put16
#undef put32
#undef unaligned
548 | |