/* Conversion loop framework.
2 | Copyright (C) 1998-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* This file provides a frame for the reader loop in all conversion modules. |
20 | The actual code must (of course) be provided in the actual module source |
21 | code but certain actions can be written down generically, with some |
22 | customization options which are these: |
23 | |
24 | MIN_NEEDED_INPUT minimal number of input bytes needed for the next |
25 | conversion. |
26 | MIN_NEEDED_OUTPUT minimal number of bytes produced by the next round |
27 | of conversion. |
28 | |
     MAX_NEEDED_INPUT	you guessed it, this is the maximal number of input
			bytes needed.  It defaults to MIN_NEEDED_INPUT.
31 | MAX_NEEDED_OUTPUT likewise for output bytes. |
32 | |
33 | LOOPFCT name of the function created. If not specified |
34 | the name is `loop' but this prevents the use |
35 | of multiple functions in the same file. |
36 | |
37 | BODY this is supposed to expand to the body of the loop. |
38 | The user must provide this. |
39 | |
40 | EXTRA_LOOP_DECLS extra arguments passed from conversion loop call. |
41 | |
42 | INIT_PARAMS code to define and initialize variables from params. |
43 | UPDATE_PARAMS code to store result in params. |
44 | |
45 | ONEBYTE_BODY body of the specialized conversion function for a |
46 | single byte from the current character set to INTERNAL. |
47 | */ |
48 | |
49 | #include <assert.h> |
50 | #include <endian.h> |
51 | #include <iconv/gconv_int.h> |
52 | #include <stdint.h> |
53 | #include <string.h> |
54 | #include <wchar.h> |
55 | #include <sys/param.h> /* For MIN. */ |
56 | #define __need_size_t |
57 | #include <stddef.h> |
58 | #include <libc-diag.h> |
59 | |
60 | #undef FCTNAME2 |
61 | #define FCTNAME(name) name |
62 | |
63 | |
64 | /* We need at least one byte for the next round. */ |
65 | #ifndef MIN_NEEDED_INPUT |
66 | # error "MIN_NEEDED_INPUT definition missing" |
67 | #elif MIN_NEEDED_INPUT < 1 |
68 | # error "MIN_NEEDED_INPUT must be >= 1" |
69 | #endif |
70 | |
71 | /* Let's see how many bytes we produce. */ |
72 | #ifndef MAX_NEEDED_INPUT |
73 | # define MAX_NEEDED_INPUT MIN_NEEDED_INPUT |
74 | #endif |
75 | |
76 | /* We produce at least one byte in the next round. */ |
77 | #ifndef MIN_NEEDED_OUTPUT |
78 | # error "MIN_NEEDED_OUTPUT definition missing" |
79 | #elif MIN_NEEDED_OUTPUT < 1 |
80 | # error "MIN_NEEDED_OUTPUT must be >= 1" |
81 | #endif |
82 | |
83 | /* Let's see how many bytes we produce. */ |
84 | #ifndef MAX_NEEDED_OUTPUT |
85 | # define MAX_NEEDED_OUTPUT MIN_NEEDED_OUTPUT |
86 | #endif |
87 | |
88 | /* Default name for the function. */ |
89 | #ifndef LOOPFCT |
90 | # define LOOPFCT loop |
91 | #endif |
92 | |
93 | /* Make sure we have a loop body. */ |
94 | #ifndef BODY |
95 | # error "Definition of BODY missing for function" LOOPFCT |
96 | #endif |
97 | |
98 | |
/* If no extra arguments have to be passed to the loop function, define
   EXTRA_LOOP_DECLS as empty.  The macro is appended directly after the
   last regular parameter of the generated functions, so it must always
   be defined, even if only to nothing.  */
#ifndef EXTRA_LOOP_DECLS
# define EXTRA_LOOP_DECLS
#endif
104 | |
/* Macros defined in this file expand UPDATE_PARAMS and REINIT_PARAMS
   unconditionally because an #ifdef test is not possible inside a macro
   body.  Modules which do not provide them get empty statements.  */
#if !defined UPDATE_PARAMS
# define UPDATE_PARAMS		do { } while (0)
#endif
#if !defined REINIT_PARAMS
# define REINIT_PARAMS		do { } while (0)
#endif
113 | |
114 | |
/* Convenience test for module writers: nonzero if conversion errors are
   to be skipped.  This requires a counter for irreversible conversions
   (IRREVERSIBLE is not NULL) and the __GCONV_IGNORE_ERRORS bit in FLAGS.
   Both names are taken from the scope in which the macro is expanded.  */
#define ignore_errors_p() \
  (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS) != 0)
119 | |
120 | |
/* Recovery from invalid input in the FROM_LOOP direction.  RESULT is set
   to __GCONV_ILLEGAL_INPUT in every case.  If errors are not ignored the
   enclosing loop is left; otherwise INCR input bytes are skipped, the
   counter of irreversible conversions is incremented and the loop goes
   on.  A plain block is used instead of the do while (0) trick because
   `break' and `continue' must act on the loop of the caller.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (ignore_errors_p ()) \
      { \
	/* Skip the invalid byte sequence and record the loss.  RESULT \
	   stays __GCONV_ILLEGAL_INPUT because of the constraint that \
	   "iconv -c" must give the same exitcode as "iconv".  */ \
	inptr += (Incr); \
	++*irreversible; \
	continue; \
      } \
 \
    /* Errors are fatal: stop converting.  */ \
    break; \
  }
138 | |
/* Recovery from unconvertible input in the TO_LOOP direction.  The steps
   are, in this order: give up at once when called without a counter for
   irreversible conversions, try transliteration if the step asks for it,
   and finally either skip INCR bytes (errors ignored) or stop.  A plain
   block is used instead of the do while (0) trick because `break' and
   `continue' must act on the loop of the caller.  */
#define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    /* A null IRREVERSIBLE means we are in a call from \
       __gconv_transliterate.  In this case we are not doing any error \
       recovery ourselves.  */ \
    if (irreversible == NULL) \
      break; \
 \
    /* If needed, flush any conversion state, so that \
       __gconv_transliterate starts with the current shift state.  */ \
    UPDATE_PARAMS; \
 \
    /* Give the transliteration methods the first chance.  */ \
    if ((step_data->__flags & __GCONV_TRANSLIT) != 0) \
      result = __gconv_transliterate \
	(step, step_data, *inptrp, \
	 &inptr, inend, &outptr, irreversible); \
 \
    REINIT_PARAMS; \
 \
    /* Transliteration ran out of output space: leave the loop.  */ \
    if (__glibc_unlikely (result == __GCONV_FULL_OUTPUT)) \
      break; \
 \
    /* Any other status than "illegal input" means the input was \
       recognized, so carry on with the loop.  */ \
    if (result != __GCONV_ILLEGAL_INPUT) \
      continue; \
 \
    if (ignore_errors_p ()) \
      { \
	/* Ignore the character.  RESULT stays __GCONV_ILLEGAL_INPUT \
	   because of the constraint that "iconv -c" must give the same \
	   exitcode as "iconv".  */ \
	++*irreversible; \
	inptr += (Incr); \
	continue; \
      } \
 \
    /* The error is not to be ignored: stop.  */ \
    break; \
  }
184 | |
185 | |
186 | /* With GCC 7 when compiling with -Os for 32-bit s390 the compiler |
187 | warns that the variable 'ch', in the definition of BODY in |
188 | sysdeps/s390/multiarch/8bit-generic.c, may be used uninitialized in |
189 | the call to UNICODE_TAG_HANDLER in that macro. This variable is |
190 | actually always initialized before use, in the prior loop if INDEX |
191 | is nonzero and in the following 'if' if INDEX is zero. That code |
192 | has a comment referencing this diagnostic disabling; updates in one |
193 | place may require updates in the other. */ |
194 | DIAG_PUSH_NEEDS_COMMENT; |
195 | DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized" ); |
196 | /* Handling of Unicode 3.1 TAG characters. Unicode recommends |
197 | "If language codes are not relevant to the particular processing |
198 | operation, then they should be ignored." This macro is usually |
199 | called right before STANDARD_TO_LOOP_ERR_HANDLER (Incr). */ |
200 | #define UNICODE_TAG_HANDLER(Character, Incr) \ |
201 | { \ |
202 | /* TAG characters are those in the range U+E0000..U+E007F. */ \ |
203 | if (((Character) >> 7) == (0xe0000 >> 7)) \ |
204 | { \ |
205 | inptr += Incr; \ |
206 | continue; \ |
207 | } \ |
208 | } |
209 | DIAG_POP_NEEDS_COMMENT; |
210 | |
211 | |
/* The generic conversion loop.  Starting at *INPTRP it expands BODY once
   per round until the input is used up (INPTR reaches INEND) or a check
   or BODY leaves the loop.  On return *INPTRP and *OUTPTRP hold the
   final input and output positions, and UPDATE_PARAMS has been run.

   The function returns the status, as defined in gconv.h.  It is
   __GCONV_EMPTY_INPUT unless a check below or BODY stored another value
   in RESULT.

   BODY, INIT_PARAMS and UPDATE_PARAMS are expanded textually, so the
   names of the parameters and local variables are part of the interface
   to the including module.  */
static inline int
__attribute ((always_inline))
FCTNAME (LOOPFCT) (struct __gconv_step *step,
		   struct __gconv_step_data *step_data,
		   const unsigned char **inptrp, const unsigned char *inend,
		   unsigned char **outptrp, const unsigned char *outend,
		   size_t *irreversible EXTRA_LOOP_DECLS)
{
  /* STATE, FLAGS and DATA only exist if the including module asked for
     them with the matching LOOP_NEED_* macro.  */
#ifdef LOOP_NEED_STATE
  mbstate_t *state = step_data->__statep;
#endif
#ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
#endif
#ifdef LOOP_NEED_DATA
  void *data = step->__data;
#endif
  /* Status if the loop ends because all input was consumed.  */
  int result = __GCONV_EMPTY_INPUT;
  /* Work on local copies; the caller's pointers are updated at the end.  */
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;

#ifdef INIT_PARAMS
  INIT_PARAMS;
#endif

  while (inptr != inend)
    {
      /* The tests involving MIN_NEEDED_INPUT and MIN_NEEDED_OUTPUT are
	 written this way to help the compiler generate better code.
	 The unused alternatives are optimized away since both macros
	 are always constants.  */
      if (MIN_NEEDED_INPUT > 1
	  && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0))
	{
	  /* We don't have enough input for another complete input
	     character.  */
	  result = __GCONV_INCOMPLETE_INPUT;
	  break;
	}
      if ((MIN_NEEDED_OUTPUT != 1
	   && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0))
	  || (MIN_NEEDED_OUTPUT == 1
	      && __builtin_expect (outptr >= outend, 0)))
	{
	  /* Overflow in the output buffer.  */
	  result = __GCONV_FULL_OUTPUT;
	  break;
	}

      /* Here comes the body the user provides.  It can stop with
	 RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the
	 input characters vary in size), GCONV_ILLEGAL_INPUT, or
	 GCONV_FULL_OUTPUT (if the output characters vary in size).  */
      BODY
    }

  /* Update the pointers pointed to by the parameters.  */
  *inptrp = inptr;
  *outptrp = outptr;
  UPDATE_PARAMS;

  return result;
}
275 | |
276 | |
#if MAX_NEEDED_INPUT > 1
# define SINGLE(fct) SINGLE2 (fct)
# define SINGLE2(fct) fct##_single
/* Convert exactly one character whose first bytes were saved in the
   conversion state by an earlier call.  The saved bytes (their number is
   kept in the low three bits of STATE->__count unless UNPACK_BYTES says
   otherwise) are copied to a local buffer, completed with bytes from
   *INPTRP, and BODY is run once on that buffer.

   Returns __GCONV_OK if a character was converted; *INPTRP and *OUTPTRP
   are advanced and the saved bytes are cleared.  Returns
   __GCONV_INCOMPLETE_INPUT if the character is still incomplete; the
   available input is consumed and saved in the state.  Returns
   __GCONV_FULL_OUTPUT if there is no room for output.  Any other value
   BODY stores in RESULT is passed through.

   BODY, INIT_PARAMS, UNPACK_BYTES, STORE_REST and CLEAR_STATE are
   expanded textually, so the names of the parameters and local variables
   are part of the interface to the including module.  */
static inline int
__attribute ((always_inline))
SINGLE(LOOPFCT) (struct __gconv_step *step,
		 struct __gconv_step_data *step_data,
		 const unsigned char **inptrp, const unsigned char *inend,
		 unsigned char **outptrp, unsigned char *outend,
		 size_t *irreversible EXTRA_LOOP_DECLS)
{
  mbstate_t *state = step_data->__statep;
# ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
# endif
# ifdef LOOP_NEED_DATA
  void *data = step->__data;
# endif
  int result = __GCONV_OK;
  /* Holds the saved bytes followed by the new input bytes.  */
  unsigned char bytebuf[MAX_NEEDED_INPUT];
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  /* Number of valid bytes in BYTEBUF.  */
  size_t inlen;

# ifdef INIT_PARAMS
  INIT_PARAMS;
# endif

# ifdef UNPACK_BYTES
  UNPACK_BYTES
# else
  /* Add the bytes from the state to the input buffer.  */
  assert ((state->__count & 7) <= sizeof (state->__value));
  for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
    bytebuf[inlen] = state->__value.__wchb[inlen];
# endif

  /* Are there enough bytes in the input buffer?  */
  if (MIN_NEEDED_INPUT > 1
      && __builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
    {
      /* All of the available input is consumed and saved.  */
      *inptrp = inend;
# ifdef STORE_REST

      /* Building with -O3 GCC emits a `array subscript is above array
	 bounds' warning.  GCC BZ #64739 has been opened for this.  */
      DIAG_PUSH_NEEDS_COMMENT;
      DIAG_IGNORE_NEEDS_COMMENT (4.9, "-Warray-bounds");
      while (inptr < inend)
	bytebuf[inlen++] = *inptr++;
      DIAG_POP_NEEDS_COMMENT;

      /* Let STORE_REST see the combined bytes as its input.  */
      inptr = bytebuf;
      inptrp = &inptr;
      inend = &bytebuf[inlen];

      STORE_REST
# else
      /* We don't have enough input for another complete input
	 character.  Append the new bytes to those already saved.
	 NOTE(review): the byte count in state->__count is not updated
	 on this path -- confirm that this is accounted for elsewhere.  */
      size_t inlen_after = inlen + (inend - inptr);
      assert (inlen_after <= sizeof (state->__value.__wchb));
      for (; inlen < inlen_after; inlen++)
	state->__value.__wchb[inlen] = *inptr++;
# endif

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Enough space in output buffer.  */
  if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
      || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
    /* Overflow in the output buffer.  */
    return __GCONV_FULL_OUTPUT;

  /* Now add characters from the normal input buffer.  */
  if (inlen >= MAX_NEEDED_INPUT || inptr >= inend)
    /* Avoid a -Wstringop-overflow= warning when this loop is
       unrolled.  The compiler cannot otherwise see that this is
       unreachable because it depends on (state->__count & 7) not
       being too large after a previous conversion step.
       Starting with GCC 12, we also have to mark the inptr >= inend
       case as unreachable to omit the warning.  Note that this SINGLE
       function is only used to implement the mb*towc*() or wc*tomb*()
       functions.  Those functions use inptr and inend pointing to a
       variable on stack, compute the inend pointer or explicitly check
       the arguments which always leads to inptr < inend.  */
    __builtin_unreachable ();
  do
    bytebuf[inlen++] = *inptr++;
  while (inlen < MAX_NEEDED_INPUT && inptr < inend);

  /* From here on the conversion reads from the local buffer.  */
  inptr = bytebuf;
  inend = &bytebuf[inlen];

  /* The single-iteration loop gives `break' and `continue' in BODY a
     loop to act on.  */
  do
    {
      BODY
    }
  while (0);

  /* Now we either have produced an output character and consumed all the
     bytes from the state and at least one more, or the character is still
     incomplete, or we have some other error (like illegal input character,
     no space in output buffer).  */
  if (__glibc_likely (inptr != bytebuf))
    {
      /* We found a new character.  */
      assert (inptr - bytebuf > (state->__count & 7));

      /* Only the bytes beyond the saved ones came from the caller.  */
      *inptrp += inptr - bytebuf - (state->__count & 7);
      *outptrp = outptr;

      result = __GCONV_OK;

      /* Clear the state buffer.  */
# ifdef CLEAR_STATE
      CLEAR_STATE;
# else
      state->__count &= ~7;
# endif
    }
  else if (result == __GCONV_INCOMPLETE_INPUT)
    {
      /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
	 available.  */
      assert (inend != &bytebuf[MAX_NEEDED_INPUT]);

      *inptrp += inend - bytebuf - (state->__count & 7);
# ifdef STORE_REST
      inptrp = &inptr;

      STORE_REST
# else
      /* We don't have enough input for another complete input
	 character.  At least one byte was appended above, so more bytes
	 are pending than were saved before.  The saved-byte count lives
	 in the low three bits (mask 7); the remaining bits of __count
	 are preserved below and must not take part in this
	 comparison.  */
      assert (inend - inptr > (state->__count & 7));
      assert (inend - inptr <= sizeof (state->__value.__wchb));
      state->__count = (state->__count & ~7) | (inend - inptr);
      for (inlen = 0; inlen < inend - inptr; inlen++)
	state->__value.__wchb[inlen] = inptr[inlen];
      inptr = inend;
# endif
    }

  return result;
}
# undef SINGLE
# undef SINGLE2


# ifdef ONEBYTE_BODY
/* Define the shortcut function for btowc.  The body is supplied by the
   including module through ONEBYTE_BODY; FROM_ONEBYTE names the
   function.  */
static wint_t
gconv_btowc (struct __gconv_step *step, unsigned char c)
  ONEBYTE_BODY
#  define FROM_ONEBYTE gconv_btowc
# endif

#endif
437 | |
/* Forget all parameter macros so that this file can be included again to
   define another function.  */

/* Size constraints.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT

/* Function name, body and extra parameters.  */
#undef LOOPFCT
#undef BODY
#undef EXTRA_LOOP_DECLS
#undef ONEBYTE_BODY

/* Parameter and state handling hooks.  */
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef REINIT_PARAMS
#undef UNPACK_BYTES
#undef CLEAR_STATE

/* Requests for optional local variables.  */
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA
457 | |