1 | /* Conversion loop frame work. |
2 | Copyright (C) 1998-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | /* This file provides a frame for the reader loop in all conversion modules. |
21 | The actual code must (of course) be provided in the actual module source |
22 | code but certain actions can be written down generically, with some |
23 | customization options which are these: |
24 | |
25 | MIN_NEEDED_INPUT minimal number of input bytes needed for the next |
26 | conversion. |
27 | MIN_NEEDED_OUTPUT minimal number of bytes produced by the next round |
28 | of conversion. |
29 | |
     MAX_NEEDED_INPUT	you guessed it, this is the maximal number of input
			bytes needed.  It defaults to MIN_NEEDED_INPUT.
     MAX_NEEDED_OUTPUT	likewise for output bytes.  It defaults to
			MIN_NEEDED_OUTPUT.
33 | |
34 | LOOPFCT name of the function created. If not specified |
35 | the name is `loop' but this prevents the use |
36 | of multiple functions in the same file. |
37 | |
38 | BODY this is supposed to expand to the body of the loop. |
39 | The user must provide this. |
40 | |
41 | EXTRA_LOOP_DECLS extra arguments passed from conversion loop call. |
42 | |
43 | INIT_PARAMS code to define and initialize variables from params. |
44 | UPDATE_PARAMS code to store result in params. |
45 | |
46 | ONEBYTE_BODY body of the specialized conversion function for a |
47 | single byte from the current character set to INTERNAL. |
48 | */ |
49 | |
50 | #include <assert.h> |
51 | #include <endian.h> |
52 | #include <iconv/gconv_int.h> |
53 | #include <stdint.h> |
54 | #include <string.h> |
55 | #include <wchar.h> |
56 | #include <sys/param.h> /* For MIN. */ |
57 | #define __need_size_t |
58 | #include <stddef.h> |
59 | #include <libc-diag.h> |
60 | |
/* We have to provide support for machines which are not able to handle
   unaligned memory accesses.  Some of the character encodings have
   representations with a fixed width of 2 or 4 bytes.  But if we cannot
   access unaligned memory we still have to read byte-wise.

   get16/get32 read and put16/put32 store a 16-bit or 32-bit value in host
   byte order at ADDR.  Both variants of get16 yield a uint16_t and both
   variants of get32 yield a uint32_t.  FCTNAME (name) expands to NAME for
   the direct-access variant and to NAME_unaligned for the byte-wise one.  */
#undef FCTNAME2
#if _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
/* We can handle unaligned memory access.  */
# define get16(addr) (*((const uint16_t *) (addr)))
# define get32(addr) (*((const uint32_t *) (addr)))

/* We need no special support for writing values either.  */
# define put16(addr, val) *((uint16_t *) (addr)) = (val)
# define put32(addr, val) *((uint32_t *) (addr)) = (val)

# define FCTNAME2(name) name
#else
/* Distinguish between big endian and little endian.  In get32 the most
   significant byte is widened to uint32_t before any shift: an
   `unsigned char' is promoted to `int', and shifting a value >= 0x80 left
   by 24 bits would overflow into the sign bit.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16(addr) \
     ((uint16_t) (((const unsigned char *) (addr))[1] << 8 \
                  | ((const unsigned char *) (addr))[0]))
#  define get32(addr) \
     (((((uint32_t) ((const unsigned char *) (addr))[3]) << 8 \
        | ((const unsigned char *) (addr))[2]) << 8 \
       | ((const unsigned char *) (addr))[1]) << 8 \
      | ((const unsigned char *) (addr))[0])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        ((unsigned char *) (addr))[1] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[0] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[3] = __val; \
        (void) 0; })
# else
#  define get16(addr) \
     ((uint16_t) (((const unsigned char *) (addr))[0] << 8 \
                  | ((const unsigned char *) (addr))[1]))
#  define get32(addr) \
     (((((uint32_t) ((const unsigned char *) (addr))[0]) << 8 \
        | ((const unsigned char *) (addr))[1]) << 8 \
       | ((const unsigned char *) (addr))[2]) << 8 \
      | ((const unsigned char *) (addr))[3])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        ((unsigned char *) (addr))[1] = __val; \
        ((unsigned char *) (addr))[0] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        ((unsigned char *) (addr))[3] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[2] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[1] = __val; \
        __val >>= 8; \
        ((unsigned char *) (addr))[0] = __val; \
        (void) 0; })
# endif

# define FCTNAME2(name) name##_unaligned
#endif
#define FCTNAME(name) FCTNAME2(name)
133 | |
134 | |
135 | /* We need at least one byte for the next round. */ |
136 | #ifndef MIN_NEEDED_INPUT |
137 | # error "MIN_NEEDED_INPUT definition missing" |
138 | #elif MIN_NEEDED_INPUT < 1 |
139 | # error "MIN_NEEDED_INPUT must be >= 1" |
140 | #endif |
141 | |
142 | /* Let's see how many bytes we produce. */ |
143 | #ifndef MAX_NEEDED_INPUT |
144 | # define MAX_NEEDED_INPUT MIN_NEEDED_INPUT |
145 | #endif |
146 | |
147 | /* We produce at least one byte in the next round. */ |
148 | #ifndef MIN_NEEDED_OUTPUT |
149 | # error "MIN_NEEDED_OUTPUT definition missing" |
150 | #elif MIN_NEEDED_OUTPUT < 1 |
151 | # error "MIN_NEEDED_OUTPUT must be >= 1" |
152 | #endif |
153 | |
154 | /* Let's see how many bytes we produce. */ |
155 | #ifndef MAX_NEEDED_OUTPUT |
156 | # define MAX_NEEDED_OUTPUT MIN_NEEDED_OUTPUT |
157 | #endif |
158 | |
159 | /* Default name for the function. */ |
160 | #ifndef LOOPFCT |
161 | # define LOOPFCT loop |
162 | #endif |
163 | |
164 | /* Make sure we have a loop body. */ |
165 | #ifndef BODY |
166 | # error "Definition of BODY missing for function" LOOPFCT |
167 | #endif |
168 | |
169 | |
/* If no extra arguments have to be passed to the loop function, define
   the macro as empty so that it can be appended unconditionally to the
   parameter lists of the functions defined in this file.  */
#ifndef EXTRA_LOOP_DECLS
# define EXTRA_LOOP_DECLS
#endif
175 | |
/* Give UPDATE_PARAMS and REINIT_PARAMS no-op defaults.  This allows them
   to be used inside other macros, where an #ifdef test on them is not
   possible.  Each default is a single statement, so `UPDATE_PARAMS;' is
   valid wherever a statement is.  */
#if !defined UPDATE_PARAMS
# define UPDATE_PARAMS		do { } while (0)
#endif
#if !defined REINIT_PARAMS
# define REINIT_PARAMS		do { } while (0)
#endif
184 | |
185 | |
/* Convenience test for module writers: nonzero when conversion errors are
   to be ignored, i.e. when the caller supplied an IRREVERSIBLE counter and
   FLAGS has __GCONV_IGNORE_ERRORS set.  Both `irreversible' and `flags'
   must be in scope at the point of use.  */
#define ignore_errors_p() \
  (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS) != 0)
190 | |
191 | |
/* Error handling for the FROM_LOOP direction, with ignoring of errors.
   RESULT is set to __GCONV_ILLEGAL_INPUT.  When errors are ignored the
   offending sequence of INCR bytes is skipped, *IRREVERSIBLE is bumped and
   the enclosing loop is continued; otherwise the enclosing loop is left.
   Note that we cannot use the do while (0) trick since `break' and
   `continue' must reach the enclosing loop.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (ignore_errors_p ()) \
      { \
        /* We ignore the invalid input byte sequence.  But we keep \
           result == __GCONV_ILLEGAL_INPUT, because of the constraint \
           that "iconv -c" must give the same exitcode as "iconv".  */ \
        inptr += (Incr); \
        ++*irreversible; \
        continue; \
      } \
 \
    break; \
  }
209 | |
/* Error handling for the TO_LOOP direction, with use of transliteration/
   transcription functions and ignoring of errors.  RESULT is set to
   __GCONV_ILLEGAL_INPUT first; if transliteration is enabled in
   step_data->__flags, __gconv_transliterate gets a chance to replace it.
   INCR is the number of input bytes to skip when the character is
   ignored.  Note that we cannot use the do while (0) trick since `break'
   and `continue' must reach the enclosing loop.  */
#define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (irreversible == NULL) \
      /* This means we are in call from __gconv_transliterate.  In this \
         case we are not doing any error recovery ourselves.  */ \
      break; \
 \
    /* If needed, flush any conversion state, so that __gconv_transliterate \
       starts with current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* First try the transliteration methods.  */ \
    if ((step_data->__flags & __GCONV_TRANSLIT) != 0) \
      result = __gconv_transliterate \
        (step, step_data, *inptrp, \
         &inptr, inend, &outptr, irreversible); \
 \
    REINIT_PARAMS; \
 \
    /* If any of them recognized the input continue with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      { \
        if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
          break; \
 \
        continue; \
      } \
 \
    /* Next see whether we have to ignore the error.  If not, stop.  */ \
    if (! ignore_errors_p ()) \
      break; \
 \
    /* When we come here it means we ignore the character.  INCR is \
       parenthesized so that any expression can be passed, as in \
       STANDARD_FROM_LOOP_ERR_HANDLER.  */ \
    ++*irreversible; \
    inptr += (Incr); \
    /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
       that "iconv -c" must give the same exitcode as "iconv".  */ \
    continue; \
  }
255 | |
256 | |
257 | /* With GCC 7 when compiling with -Os for 32-bit s390 the compiler |
258 | warns that the variable 'ch', in the definition of BODY in |
259 | sysdeps/s390/multiarch/8bit-generic.c, may be used uninitialized in |
260 | the call to UNICODE_TAG_HANDLER in that macro. This variable is |
261 | actually always initialized before use, in the prior loop if INDEX |
262 | is nonzero and in the following 'if' if INDEX is zero. That code |
263 | has a comment referencing this diagnostic disabling; updates in one |
264 | place may require updates in the other. */ |
265 | DIAG_PUSH_NEEDS_COMMENT; |
266 | DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized" ); |
267 | /* Handling of Unicode 3.1 TAG characters. Unicode recommends |
268 | "If language codes are not relevant to the particular processing |
269 | operation, then they should be ignored." This macro is usually |
270 | called right before STANDARD_TO_LOOP_ERR_HANDLER (Incr). */ |
271 | #define UNICODE_TAG_HANDLER(Character, Incr) \ |
272 | { \ |
273 | /* TAG characters are those in the range U+E0000..U+E007F. */ \ |
274 | if (((Character) >> 7) == (0xe0000 >> 7)) \ |
275 | { \ |
276 | inptr += Incr; \ |
277 | continue; \ |
278 | } \ |
279 | } |
280 | DIAG_POP_NEEDS_COMMENT; |
281 | |
282 | |
283 | /* The function returns the status, as defined in gconv.h. */ |
284 | static inline int |
285 | __attribute ((always_inline)) |
286 | FCTNAME (LOOPFCT) (struct __gconv_step *step, |
287 | struct __gconv_step_data *step_data, |
288 | const unsigned char **inptrp, const unsigned char *inend, |
289 | unsigned char **outptrp, const unsigned char *outend, |
290 | size_t *irreversible EXTRA_LOOP_DECLS) |
291 | { |
292 | #ifdef LOOP_NEED_STATE |
293 | mbstate_t *state = step_data->__statep; |
294 | #endif |
295 | #ifdef LOOP_NEED_FLAGS |
296 | int flags = step_data->__flags; |
297 | #endif |
298 | #ifdef LOOP_NEED_DATA |
299 | void *data = step->__data; |
300 | #endif |
301 | int result = __GCONV_EMPTY_INPUT; |
302 | const unsigned char *inptr = *inptrp; |
303 | unsigned char *outptr = *outptrp; |
304 | |
305 | #ifdef INIT_PARAMS |
306 | INIT_PARAMS; |
307 | #endif |
308 | |
309 | while (inptr != inend) |
310 | { |
311 | /* `if' cases for MIN_NEEDED_OUTPUT ==/!= 1 is made to help the |
312 | compiler generating better code. They will be optimized away |
313 | since MIN_NEEDED_OUTPUT is always a constant. */ |
314 | if (MIN_NEEDED_INPUT > 1 |
315 | && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0)) |
316 | { |
317 | /* We don't have enough input for another complete input |
318 | character. */ |
319 | result = __GCONV_INCOMPLETE_INPUT; |
320 | break; |
321 | } |
322 | if ((MIN_NEEDED_OUTPUT != 1 |
323 | && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0)) |
324 | || (MIN_NEEDED_OUTPUT == 1 |
325 | && __builtin_expect (outptr >= outend, 0))) |
326 | { |
327 | /* Overflow in the output buffer. */ |
328 | result = __GCONV_FULL_OUTPUT; |
329 | break; |
330 | } |
331 | |
332 | /* Here comes the body the user provides. It can stop with |
333 | RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the |
334 | input characters vary in size), GCONV_ILLEGAL_INPUT, or |
335 | GCONV_FULL_OUTPUT (if the output characters vary in size). */ |
336 | BODY |
337 | } |
338 | |
339 | /* Update the pointers pointed to by the parameters. */ |
340 | *inptrp = inptr; |
341 | *outptrp = outptr; |
342 | UPDATE_PARAMS; |
343 | |
344 | return result; |
345 | } |
346 | |
347 | |
348 | /* Include the file a second time to define the function to handle |
349 | unaligned access. */ |
350 | #if !defined DEFINE_UNALIGNED && !_STRING_ARCH_unaligned \ |
351 | && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \ |
352 | && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0 |
353 | # undef get16 |
354 | # undef get32 |
355 | # undef put16 |
356 | # undef put32 |
357 | # undef unaligned |
358 | |
359 | # define DEFINE_UNALIGNED |
360 | # include "loop.c" |
361 | # undef DEFINE_UNALIGNED |
362 | #else |
# if MAX_NEEDED_INPUT > 1
#  define SINGLE(fct) SINGLE2 (fct)
#  define SINGLE2(fct) fct##_single
/* Convert a single character part of which is already stored in the
   conversion state.  The bytes from the state (by default the low three
   bits of state->__count give their number and state->__value.__wchb
   holds them; a module can override this with UNPACK_BYTES) are copied
   into BYTEBUF, followed by bytes from *INPTRP, and BODY is run once on
   that buffer.

   Returns __GCONV_INCOMPLETE_INPUT if the input still does not form a
   complete character (the bytes are then saved in the state and counted
   as consumed), __GCONV_FULL_OUTPUT if the output buffer has no room,
   __GCONV_OK if a character was converted (*INPTRP and *OUTPTRP are
   advanced and the saved bytes cleared), or otherwise the value BODY
   stored in RESULT.

   The names of the parameters and local variables must not be changed:
   BODY, UNPACK_BYTES, STORE_REST and CLEAR_STATE refer to them.  */
static inline int
__attribute ((always_inline))
SINGLE(LOOPFCT) (struct __gconv_step *step,
                 struct __gconv_step_data *step_data,
                 const unsigned char **inptrp, const unsigned char *inend,
                 unsigned char **outptrp, unsigned char *outend,
                 size_t *irreversible EXTRA_LOOP_DECLS)
{
  mbstate_t *state = step_data->__statep;
#  ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
#  endif
#  ifdef LOOP_NEED_DATA
  void *data = step->__data;
#  endif
  int result = __GCONV_OK;
  unsigned char bytebuf[MAX_NEEDED_INPUT];
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t inlen;

#  ifdef INIT_PARAMS
  INIT_PARAMS;
#  endif

#  ifdef UNPACK_BYTES
  UNPACK_BYTES
#  else
  /* Add the bytes from the state to the input buffer.  */
  assert ((state->__count & 7) <= sizeof (state->__value));
  for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
    bytebuf[inlen] = state->__value.__wchb[inlen];
#  endif

  /* Are there enough bytes in the input buffer?  */
  if (MIN_NEEDED_INPUT > 1
      && __builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
    {
      /* All of the new input is consumed; it is saved below.  */
      *inptrp = inend;
#  ifdef STORE_REST

      /* Building with -O3 GCC emits a `array subscript is above array
         bounds' warning.  GCC BZ #64739 has been opened for this.  */
      DIAG_PUSH_NEEDS_COMMENT;
      DIAG_IGNORE_NEEDS_COMMENT (4.9, "-Warray-bounds");
      while (inptr < inend)
        bytebuf[inlen++] = *inptr++;
      DIAG_POP_NEEDS_COMMENT;

      /* Let STORE_REST see the collected bytes as its input.  */
      inptr = bytebuf;
      inptrp = &inptr;
      inend = &bytebuf[inlen];

      STORE_REST
#  else
      /* We don't have enough input for another complete input
         character.
         NOTE(review): this path appends the new bytes to
         state->__value.__wchb but does not update the byte count kept in
         state->__count; confirm that this is intended.  */
      size_t inlen_after = inlen + (inend - inptr);
      assert (inlen_after <= sizeof (state->__value.__wchb));
      for (; inlen < inlen_after; inlen++)
        state->__value.__wchb[inlen] = *inptr++;
#  endif

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Enough space in output buffer.  */
  if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
      || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
    /* Overflow in the output buffer.  */
    return __GCONV_FULL_OUTPUT;

  /* Now add characters from the normal input buffer.  */
  do
    bytebuf[inlen++] = *inptr++;
  while (inlen < MAX_NEEDED_INPUT && inptr < inend);

  inptr = bytebuf;
  inend = &bytebuf[inlen];

  /* Run BODY exactly once.  The do-while (0) gives the `break' and
     `continue' statements in BODY a loop to leave.  */
  do
    {
      BODY
    }
  while (0);

  /* Now we either have produced an output character and consumed all the
     bytes from the state and at least one more, or the character is still
     incomplete, or we have some other error (like illegal input character,
     no space in output buffer).  */
  if (__glibc_likely (inptr != bytebuf))
    {
      /* We found a new character.  */
      assert (inptr - bytebuf > (state->__count & 7));

      /* Only the bytes beyond those taken from the state came from the
         caller's input buffer.  */
      *inptrp += inptr - bytebuf - (state->__count & 7);
      *outptrp = outptr;

      result = __GCONV_OK;

      /* Clear the state buffer.  */
#  ifdef CLEAR_STATE
      CLEAR_STATE;
#  else
      state->__count &= ~7;
#  endif
    }
  else if (result == __GCONV_INCOMPLETE_INPUT)
    {
      /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
         available.  */
      assert (inend != &bytebuf[MAX_NEEDED_INPUT]);

      *inptrp += inend - bytebuf - (state->__count & 7);
#  ifdef STORE_REST
      inptrp = &inptr;

      STORE_REST
#  else
      /* We don't have enough input for another complete input
         character.  The buffer must hold more bytes than were stored in
         the state before.  Only the low three bits of __count are the
         byte count; the remaining bits are preserved below and may hold
         other state, so they must not take part in this comparison.  */
      assert (inend - inptr > (state->__count & 7));
      assert (inend - inptr <= sizeof (state->__value.__wchb));
      state->__count = (state->__count & ~7) | (inend - inptr);
      for (inlen = 0; inlen < inend - inptr; inlen++)
        state->__value.__wchb[inlen] = inptr[inlen];
      inptr = inend;
#  endif
    }

  return result;
}
#  undef SINGLE
#  undef SINGLE2
# endif
501 | |
502 | |
503 | # ifdef ONEBYTE_BODY |
504 | /* Define the shortcut function for btowc. */ |
505 | static wint_t |
506 | gconv_btowc (struct __gconv_step *step, unsigned char c) |
507 | ONEBYTE_BODY |
508 | # define FROM_ONEBYTE gconv_btowc |
509 | # endif |
510 | |
511 | #endif |
512 | |
/* We remove the macro definitions so that we can include this file again
   for the definition of another function.  */

/* Size parameters, function name and body.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT
#undef LOOPFCT
#undef BODY

/* Optional customization hooks.  */
#undef EXTRA_LOOP_DECLS
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef REINIT_PARAMS
#undef ONEBYTE_BODY
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA

/* Memory access helpers.  */
#undef get16
#undef get32
#undef put16
#undef put32
#undef unaligned
537 | |