/* Conversion module for UTF-7.
   Copyright (C) 2000-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19 | |
/* UTF-7 is a legacy encoding used for transmitting Unicode within the
   ASCII character set, used primarily by mail agents.  New programs
   are encouraged to use UTF-8 instead.

   UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
   original Base64 encoding is defined in RFC 2045.  */
26 | |
27 | #include <dlfcn.h> |
28 | #include <gconv.h> |
29 | #include <stdint.h> |
30 | #include <stdlib.h> |
31 | |
32 | |
/* Define this to 1 if you want the so-called "optional direct" characters
      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   to be encoded.  Define to 0 if you want them to be passed straight
   through, like the so-called "direct" characters.
   We set this to 1 because it's safer.

   The to-UTF-7 loop tests this constant to choose between isdirect ()
   (value 1) and isxdirect () (value 0) when deciding whether a character
   may be copied to the output unencoded.  */
#define UTF7_ENCODE_OPTIONAL_CHARS 1
40 | |
41 | |
/* The set of "direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr

   direct_tab is a 128-bit bitmap: character C is a member when bit
   (C & 7) of byte (C >> 3) is set.  */

static const unsigned char direct_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is a "direct character", 0 otherwise.  Values of 128
   and above are rejected before the table is indexed.  */
static int
isdirect (uint32_t ch)
{
  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
}
57 | |
58 | |
/* The set of "direct and optional direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }

   xdirect_tab is a 128-bit bitmap: character C is a member when bit
   (C & 7) of byte (C >> 3) is set.  '+', '\' and '~' are not members.  */

static const unsigned char xdirect_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
  };

/* Return 1 if CH is a "direct" or "optional direct" character, 0
   otherwise.  Values of 128 and above are rejected before the table is
   indexed.  */
static int
isxdirect (uint32_t ch)
{
  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
}
75 | |
76 | |
/* The set of "extended base64 characters":
   A-Z a-z 0-9 + / -

   xbase64_tab is a 128-bit bitmap: character C is a member when bit
   (C & 7) of byte (C >> 3) is set.  */

static const unsigned char xbase64_tab[128 / 8] =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is a base64 digit or '-', 0 otherwise.  Values of 128
   and above are rejected before the table is indexed.  The encoder uses
   this to decide whether an explicit '-' must follow a base64 run.  */
static int
isxbase64 (uint32_t ch)
{
  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
}
92 | |
93 | |
/* Converts a value in the range 0..63 to a base64 encoded char:
   0..25 -> 'A'..'Z', 26..51 -> 'a'..'z', 52..61 -> '0'..'9',
   62 -> '+', 63 -> '/'.
   Any other value is a caller bug and aborts the process.  */
static unsigned char
base64 (unsigned int i)
{
  if (i < 26)
    return i + 'A';
  else if (i < 52)
    return i - 26 + 'a';
  else if (i < 62)
    return i - 52 + '0';
  else if (i == 62)
    return '+';
  else if (i == 63)
    return '/';
  else
    abort ();
}
111 | |
112 | |
/* Definitions used in the body of the `gconv' function.

   The FROM side is UTF-7 text and the TO side is UCS4: both loop bodies
   below move the UCS4 pointer in steps of exactly 4 bytes, and the
   to-UTF-7 body reserves at most 6 output bytes for one character.
   NOTE(review): these macros are presumably consumed by the generic
   converter templates included further down; those files are not
   visible here.  */
#define CHARSET_NAME "UTF-7//"
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define FROM_LOOP from_utf7_loop
#define TO_LOOP to_utf7_loop
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4
#define MAX_NEEDED_TO 4
#define ONE_DIRECTION 0
/* Declares the two locals the rest of this file relies on: `statep', the
   conversion state the loop bodies read and update, and `saved_state',
   the snapshot used by SAVE_RESET_STATE.  */
#define PREPARE_LOOP \
  mbstate_t saved_state; \
  mbstate_t *statep = data->__statep;
/* Extra trailing argument for the loop functions: the state pointer.
   The leading comma is intentional, the text is appended to an existing
   argument list.  */
#define EXTRA_LOOP_ARGS , statep
128 | |
129 | |
/* Since we might have to reset input pointer we must be able to save
   and restore the state.

   SAVE_RESET_STATE (1) copies *statep into saved_state;
   SAVE_RESET_STATE (0) copies saved_state back into *statep.
   Both variables come from PREPARE_LOOP.  The expansion deliberately has
   no trailing semicolon: the user supplies it.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    saved_state = *statep; \
  else \
    *statep = saved_state
137 | |
138 | |
/* First define the conversion function from UTF-7 to UCS4.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 8..3: shift
     __wch: data
   Precise meaning:
     shift      data
       0         --          not inside base64 encoding
     1..32  XX..XX00..00     inside base64, (32 - shift) bits pending
   This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.

   When shift = 0, __wch needs to store at most one lookahead byte (see
   __GCONV_INCOMPLETE_INPUT below).
*/
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* One decoding step.  It looks at the byte at INPTR and either copies a
   direct character, enters or leaves a base64 run, or feeds one base64
   digit (6 bits) into the 32-bit accumulator __wch, which is filled from
   the most significant bit downwards.  Each completed UTF-16 unit or
   surrogate pair is written as one UCS4 character.
   NOTE(review): inptr, inend, outptr, result, put32 and
   STANDARD_FROM_LOOP_ERR_HANDLER are supplied by the code that expands
   this macro; it is not visible here.  */
#define BODY \
  { \
    uint_fast8_t ch = *inptr; \
    \
    if ((statep->__count >> 3) == 0) \
      { \
        /* base64 encoding inactive.  */ \
        if (isxdirect (ch)) \
          { \
            inptr++; \
            put32 (outptr, ch); \
            outptr += 4; \
          } \
        else if (__glibc_likely (ch == '+')) \
          { \
            /* "+-" is a literal '+', any other "+x" opens a base64 run, \
               so the byte after '+' must be visible before deciding.  */ \
            if (__glibc_unlikely (inptr + 2 > inend)) \
              { \
                /* Not enough input available.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            if (inptr[1] == '-') \
              { \
                inptr += 2; \
                put32 (outptr, ch); \
                outptr += 4; \
              } \
            else \
              { \
                /* Switch into base64 mode.  */ \
                /* shift = 32: empty accumulator, no bits pending.  */ \
                inptr++; \
                statep->__count = (32 << 3); \
                statep->__value.__wch = 0; \
              } \
          } \
        else \
          { \
            /* The input is invalid.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    else \
      { \
        /* base64 encoding active.  */ \
        uint32_t i; \
        int shift; \
        \
        /* Map the base64 digit to its 6-bit value.  */ \
        if (ch >= 'A' && ch <= 'Z') \
          i = ch - 'A'; \
        else if (ch >= 'a' && ch <= 'z') \
          i = ch - 'a' + 26; \
        else if (ch >= '0' && ch <= '9') \
          i = ch - '0' + 52; \
        else if (ch == '+') \
          i = 62; \
        else if (ch == '/') \
          i = 63; \
        else \
          { \
            /* Terminate base64 encoding.  */ \
            \
            /* If accumulated data is nonzero, the input is invalid.  */ \
            /* Also, partial UTF-16 characters are invalid.  */ \
            /* shift <= 26 means that 6 or more bits are still pending.  */ \
            if (__builtin_expect (statep->__value.__wch != 0, 0) \
                || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \
              { \
                /* The comma expression resets the state and yields 1 as \
                   the handler's argument.  */ \
                STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
              } \
            \
            /* An explicit '-' terminator is consumed.  Any other byte is \
               left in place, to be looked at again with base64 mode \
               switched off.  */ \
            if (ch == '-') \
              inptr++; \
            \
            statep->__count = 0; \
            continue; \
          } \
        \
        /* Concatenate the base64 integer i to the accumulator.  */ \
        shift = (statep->__count >> 3); \
        if (shift > 6) \
          { \
            /* All 6 new bits fit below the bits already pending.  */ \
            uint32_t wch; \
            \
            shift -= 6; \
            wch = statep->__value.__wch | (i << shift); \
            \
            if (shift <= 16 && shift > 10) \
              { \
                /* An UTF-16 character has just been completed.  */ \
                uint32_t wc1 = wch >> 16; \
                \
                /* UTF-16: When we see a High Surrogate, we must also decode \
                   the following Low Surrogate. */ \
                if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) \
                  { \
                    /* Emit the unit and move the leftover bits to the top \
                       of the accumulator.  */ \
                    wch = wch << 16; \
                    shift += 16; \
                    put32 (outptr, wc1); \
                    outptr += 4; \
                  } \
              } \
            else if (shift <= 10 && shift > 4) \
              { \
                /* After a High Surrogate, verify that the next 16 bit \
                   indeed form a Low Surrogate.  */ \
                /* Bits 15..10 are known here and the rest are still \
                   zero, so the range test only sees the top six bits.  */ \
                uint32_t wc2 = wch & 0xffff; \
                \
                if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \
                  { \
                    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
                  } \
              } \
            \
            statep->__value.__wch = wch; \
          } \
        else \
          { \
            /* An UTF-16 surrogate pair has just been completed.  */ \
            /* The top `shift' bits of i finish the low surrogate.  */ \
            uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16; \
            uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff) \
                           | (i >> (6 - shift)); \
            \
            /* The remaining (6 - shift) bits of i start the next unit at \
               the top of the accumulator.  */ \
            statep->__value.__wch = (i << shift) << 26; \
            shift += 26; \
            \
            assert (wc1 >= 0xd800 && wc1 < 0xdc00); \
            assert (wc2 >= 0xdc00 && wc2 < 0xe000); \
            put32 (outptr, \
                   0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00)); \
            outptr += 4; \
          } \
        \
        statep->__count = shift << 3; \
        \
        /* Now that we digested the input increment the input pointer.  */ \
        inptr++; \
      } \
  }
/* NOTE(review): LOOP_NEED_FLAGS is presumably a request to the loop
   template included next; the template is not visible here.  */
#define LOOP_NEED_FLAGS
/* Extra trailing parameter of the UTF-7 -> UCS4 loop function: the
   conversion state its BODY reads and updates through `statep'.  The
   leading comma is intentional, the text is appended to an existing
   parameter list.  */
#define EXTRA_LOOP_DECLS , mbstate_t *statep
297 | #include <iconv/loop.c> |
298 | |
299 | |
/* Next, define the conversion from UCS4 to UTF-7.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 4..3: shift
     __count bit 8..5: data
   Precise meaning:
     shift      data
       0         0           not inside base64 encoding
       1         0           inside base64, no pending bits
       2       XX00          inside base64, 2 bits known for next byte
       3       XXXX          inside base64, 4 bits known for next byte

   __count bit 2..0 and __wch are always zero, because this direction
   never returns __GCONV_INCOMPLETE_INPUT.
*/
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MAX_NEEDED_INPUT MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
/* One encoding step.  It reads one UCS4 character and writes it either
   unencoded (direct characters) or as base64 digits of its UTF-16 form,
   opening or closing a "+...-" run as needed.  Bits that do not fill a
   whole base64 digit stay in the state.  Throughout,
   (statep->__count >> 3) & ~3 is the pending-bit field aligned to the
   top of a 6-bit digit, with the shift field masked off.
   NOTE(review): inptr, outptr, outend, result, get32 and
   STANDARD_TO_LOOP_ERR_HANDLER are supplied by the code that expands this
   macro; it is not visible here.  */
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    \
    if ((statep->__count & 0x18) == 0) \
      { \
        /* base64 encoding inactive */ \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
          { \
            *outptr++ = (unsigned char) ch; \
          } \
        else \
          { \
            size_t count; \
            \
            /* Output bytes needed: "+-" for a literal '+'; '+' and two \
               digits for a 16-bit unit (4 bits stay pending); '+' and \
               five digits for a surrogate pair (2 bits stay pending).  */ \
            if (ch == '+') \
              count = 2; \
            else if (ch < 0x10000) \
              count = 3; \
            else if (ch < 0x110000) \
              count = 6; \
            else \
              STANDARD_TO_LOOP_ERR_HANDLER (4); \
            \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            *outptr++ = '+'; \
            if (ch == '+') \
              *outptr++ = '-'; \
            else if (ch < 0x10000) \
              { \
                *outptr++ = base64 (ch >> 10); \
                *outptr++ = base64 ((ch >> 4) & 0x3f); \
                statep->__count = ((ch & 15) << 5) | (3 << 3); \
              } \
            else if (ch < 0x110000) \
              { \
                /* Split into a UTF-16 surrogate pair and treat the pair \
                   as one 32-bit value.  */ \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
                \
                ch = (ch1 << 16) | ch2; \
                *outptr++ = base64 (ch >> 26); \
                *outptr++ = base64 ((ch >> 20) & 0x3f); \
                *outptr++ = base64 ((ch >> 14) & 0x3f); \
                *outptr++ = base64 ((ch >> 8) & 0x3f); \
                *outptr++ = base64 ((ch >> 2) & 0x3f); \
                statep->__count = ((ch & 3) << 7) | (2 << 3); \
              } \
            else \
              abort (); \
          } \
      } \
    else \
      { \
        /* base64 encoding active */ \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
          { \
            /* deactivate base64 encoding */ \
            size_t count; \
            \
            /* One digit to flush pending bits (shift 2 or 3), one '-' \
               when CH itself is a base64 digit or '-', plus CH.  */ \
            count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            if ((statep->__count & 0x18) >= 0x10) \
              *outptr++ = base64 ((statep->__count >> 3) & ~3); \
            if (isxbase64 (ch)) \
              *outptr++ = '-'; \
            *outptr++ = (unsigned char) ch; \
            statep->__count = 0; \
          } \
        else \
          { \
            size_t count; \
            \
            /* Digits needed for 16 (or 32) new bits on top of the 0, 2 \
               or 4 bits already pending.  */ \
            if (ch < 0x10000) \
              count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2); \
            else if (ch < 0x110000) \
              count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5); \
            else \
              STANDARD_TO_LOOP_ERR_HANDLER (4); \
            \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            if (ch < 0x10000) \
              { \
                switch ((statep->__count >> 3) & 3) \
                  { \
                  case 1: \
                    /* 0 + 16 bits: two digits, 4 bits left over.  */ \
                    *outptr++ = base64 (ch >> 10); \
                    *outptr++ = base64 ((ch >> 4) & 0x3f); \
                    statep->__count = ((ch & 15) << 5) | (3 << 3); \
                    break; \
                  case 2: \
                    /* 2 + 16 bits: three digits, nothing left over.  */ \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 12)); \
                    *outptr++ = base64 ((ch >> 6) & 0x3f); \
                    *outptr++ = base64 (ch & 0x3f); \
                    statep->__count = (1 << 3); \
                    break; \
                  case 3: \
                    /* 4 + 16 bits: three digits, 2 bits left over.  */ \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 14)); \
                    *outptr++ = base64 ((ch >> 8) & 0x3f); \
                    *outptr++ = base64 ((ch >> 2) & 0x3f); \
                    statep->__count = ((ch & 3) << 7) | (2 << 3); \
                    break; \
                  default: \
                    abort (); \
                  } \
              } \
            else if (ch < 0x110000) \
              { \
                /* Split into a UTF-16 surrogate pair and treat the pair \
                   as one 32-bit value.  */ \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
                \
                ch = (ch1 << 16) | ch2; \
                switch ((statep->__count >> 3) & 3) \
                  { \
                  case 1: \
                    /* 0 + 32 bits: five digits, 2 bits left over.  */ \
                    *outptr++ = base64 (ch >> 26); \
                    *outptr++ = base64 ((ch >> 20) & 0x3f); \
                    *outptr++ = base64 ((ch >> 14) & 0x3f); \
                    *outptr++ = base64 ((ch >> 8) & 0x3f); \
                    *outptr++ = base64 ((ch >> 2) & 0x3f); \
                    statep->__count = ((ch & 3) << 7) | (2 << 3); \
                    break; \
                  case 2: \
                    /* 2 + 32 bits: five digits, 4 bits left over.  */ \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 28)); \
                    *outptr++ = base64 ((ch >> 22) & 0x3f); \
                    *outptr++ = base64 ((ch >> 16) & 0x3f); \
                    *outptr++ = base64 ((ch >> 10) & 0x3f); \
                    *outptr++ = base64 ((ch >> 4) & 0x3f); \
                    statep->__count = ((ch & 15) << 5) | (3 << 3); \
                    break; \
                  case 3: \
                    /* 4 + 32 bits: six digits, nothing left over.  */ \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 30)); \
                    *outptr++ = base64 ((ch >> 24) & 0x3f); \
                    *outptr++ = base64 ((ch >> 18) & 0x3f); \
                    *outptr++ = base64 ((ch >> 12) & 0x3f); \
                    *outptr++ = base64 ((ch >> 6) & 0x3f); \
                    *outptr++ = base64 (ch & 0x3f); \
                    statep->__count = (1 << 3); \
                    break; \
                  default: \
                    abort (); \
                  } \
              } \
            else \
              abort (); \
          } \
      } \
    \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
/* NOTE(review): LOOP_NEED_FLAGS is presumably a request to the loop
   template included next; the template is not visible here.  */
#define LOOP_NEED_FLAGS
/* Extra trailing parameter of the UCS4 -> UTF-7 loop function: the
   conversion state its BODY reads and updates through `statep'.  The
   leading comma is intentional, the text is appended to an existing
   parameter list.  */
#define EXTRA_LOOP_DECLS , mbstate_t *statep
491 | #include <iconv/loop.c> |
492 | |
493 | |
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   In the from-UTF-7 direction the state is simply cleared.  In the
   to-UTF-7 direction an open base64 run is closed first: pending bits (if
   any) are written as one last base64 digit, followed by '-'.  If the
   output buffer cannot hold these one or two bytes, status is set to
   __GCONV_FULL_OUTPUT and the state is left untouched.
   NOTE(review): data, outbuf, outend, status and FROM_DIRECTION are
   supplied by the code that expands this macro; it is not visible
   here.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION) \
    /* Nothing to emit.  */ \
    memset (data->__statep, '\0', sizeof (mbstate_t)); \
  else \
    { \
      /* The "to UTF-7" direction.  Flush the remaining bits and terminate \
         with a '-' byte.  This will guarantee correct decoding if more \
         UTF-7 encoded text is added afterwards.  */ \
      int state = data->__statep->__count; \
      \
      if (state & 0x18) \
        { \
          /* Deactivate base64 encoding.  */ \
          /* One digit for pending bits (shift 2 or 3) plus the '-'.  */ \
          size_t count = ((state & 0x18) >= 0x10) + 1; \
          \
          if (__glibc_unlikely (outbuf + count > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              if ((state & 0x18) >= 0x10) \
                *outbuf++ = base64 ((state >> 3) & ~3); \
              *outbuf++ = '-'; \
              \
              data->__statep->__count = 0; \
            } \
        } \
      else \
        data->__statep->__count = 0; \
    }
529 | |
530 | |
531 | /* Now define the toplevel functions. */ |
532 | #include <iconv/skeleton.c> |
533 | |