/* Conversion module for ISO-2022-JP and ISO-2022-JP-2.
   Copyright (C) 1998-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19 | |
20 | #include <assert.h> |
21 | #include <dlfcn.h> |
22 | #include <gconv.h> |
23 | #include <stdint.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include "jis0201.h" |
27 | #include "jis0208.h" |
28 | #include "jis0212.h" |
29 | #include "gb2312.h" |
30 | #include "ksc5601.h" |
31 | |
/* One contiguous range of UCS4 code points covered by a reverse lookup
   table.  The conversion code below walks an array of these (`from_idx')
   until `end' is not smaller than the character, checks the character
   against `start', and then adds `idx' to the character-derived offset
   to index the byte table.  */
struct gap
{
  uint16_t start;   /* First code point of the range.  */
  uint16_t end;     /* Last code point of the range.  */
  int32_t idx;      /* Offset added when indexing the byte table.  */
};
38 | |
39 | #include "iso8859-7jp.h" |
40 | |
41 | /* This makes obvious what everybody knows: 0x1b is the Esc character. */ |
42 | #define ESC 0x1b |
43 | |
44 | /* We provide our own initialization and destructor function. */ |
45 | #define DEFINE_INIT 0 |
46 | #define DEFINE_FINI 0 |
47 | |
48 | /* Definitions used in the body of the `gconv' function. */ |
49 | #define FROM_LOOP from_iso2022jp_loop |
50 | #define TO_LOOP to_iso2022jp_loop |
51 | #define ONE_DIRECTION 0 |
52 | #define FROM_LOOP_MIN_NEEDED_FROM 1 |
53 | #define FROM_LOOP_MAX_NEEDED_FROM 4 |
54 | #define FROM_LOOP_MIN_NEEDED_TO 4 |
55 | #define FROM_LOOP_MAX_NEEDED_TO 4 |
56 | #define TO_LOOP_MIN_NEEDED_FROM 4 |
57 | #define TO_LOOP_MAX_NEEDED_FROM 4 |
58 | #define TO_LOOP_MIN_NEEDED_TO 1 |
59 | #define TO_LOOP_MAX_NEEDED_TO 6 |
60 | #define FROM_DIRECTION (dir == from_iso2022jp) |
61 | #define PREPARE_LOOP \ |
62 | enum direction dir = ((struct iso2022jp_data *) step->__data)->dir; \ |
63 | enum variant var = ((struct iso2022jp_data *) step->__data)->var; \ |
64 | int save_set; \ |
65 | int *setp = &data->__statep->__count; |
66 | #define , var, setp |
67 | |
68 | |
/* Direction of the transformation.  `illegal_dir' is the value used
   while no supported charset name has been matched.  */
enum direction
{
  illegal_dir,
  to_iso2022jp,
  from_iso2022jp
};
76 | |
/* We handle ISO-2022-JP and ISO-2022-JP-2 here.  `illegal_var' is the
   value used while no supported charset name has been matched.  */
enum variant
{
  illegal_var,
  iso2022jp,
  iso2022jp2
};
84 | |
85 | |
86 | struct iso2022jp_data |
87 | { |
88 | enum direction dir; |
89 | enum variant var; |
90 | }; |
91 | |
92 | |
/* The COUNT element of the state keeps track of the currently selected
   character set.  The selection occupies bits 3..5 of the state word
   (the three low bits are left untouched by this module).  The possible
   values are:  */
enum
{
  ASCII_set = 0,
  JISX0208_1978_set = 1 << 3,
  JISX0208_1983_set = 2 << 3,
  JISX0201_Roman_set = 3 << 3,
  JISX0201_Kana_set = 4 << 3,
  GB2312_set = 5 << 3,
  KSC5601_set = 6 << 3,
  JISX0212_set = 7 << 3,
  CURRENT_SEL_MASK = 7 << 3
};
107 | |
/* The second value stored is the designation of the G2 set.  It occupies
   bits 6..7 of the state word.  The following values are possible:  */
enum
{
  UNSPECIFIED_set = 0,
  ISO88591_set = 1 << 6,
  ISO88597_set = 2 << 6,
  CURRENT_ASSIGN_MASK = 3 << 6
};
117 | |
/* The third value, only used during conversion from Unicode to
   ISO-2022-JP-2, describes the language tag parsing status.  It occupies
   bits 8..10 of the state word.  The possible values are as follows.
   Values >= TAG_language are temporary tag parsing states; the stable
   states (none, ja, ko, zh) are numbered 0..3 so that `tag >> 8' can be
   used directly as an array index.  */
enum
{
  TAG_none = 0,
  TAG_language = 4 << 8,
  TAG_language_j = 5 << 8,
  TAG_language_ja = 1 << 8,
  TAG_language_k = 6 << 8,
  TAG_language_ko = 2 << 8,
  TAG_language_z = 7 << 8,
  TAG_language_zh = 3 << 8,
  CURRENT_TAG_MASK = 7 << 8
};
133 | |
134 | |
135 | extern int gconv_init (struct __gconv_step *step); |
136 | int |
137 | gconv_init (struct __gconv_step *step) |
138 | { |
139 | /* Determine which direction. */ |
140 | struct iso2022jp_data *new_data; |
141 | enum direction dir = illegal_dir; |
142 | enum variant var = illegal_var; |
143 | int result; |
144 | |
145 | if (__strcasecmp (step->__from_name, "ISO-2022-JP//" ) == 0) |
146 | { |
147 | dir = from_iso2022jp; |
148 | var = iso2022jp; |
149 | } |
150 | else if (__strcasecmp (step->__to_name, "ISO-2022-JP//" ) == 0) |
151 | { |
152 | dir = to_iso2022jp; |
153 | var = iso2022jp; |
154 | } |
155 | else if (__strcasecmp (step->__from_name, "ISO-2022-JP-2//" ) == 0) |
156 | { |
157 | dir = from_iso2022jp; |
158 | var = iso2022jp2; |
159 | } |
160 | else if (__strcasecmp (step->__to_name, "ISO-2022-JP-2//" ) == 0) |
161 | { |
162 | dir = to_iso2022jp; |
163 | var = iso2022jp2; |
164 | } |
165 | |
166 | result = __GCONV_NOCONV; |
167 | if (__builtin_expect (dir, from_iso2022jp) != illegal_dir) |
168 | { |
169 | new_data |
170 | = (struct iso2022jp_data *) malloc (sizeof (struct iso2022jp_data)); |
171 | |
172 | result = __GCONV_NOMEM; |
173 | if (new_data != NULL) |
174 | { |
175 | new_data->dir = dir; |
176 | new_data->var = var; |
177 | step->__data = new_data; |
178 | |
179 | if (dir == from_iso2022jp) |
180 | { |
181 | step->__min_needed_from = FROM_LOOP_MIN_NEEDED_FROM; |
182 | step->__max_needed_from = FROM_LOOP_MAX_NEEDED_FROM; |
183 | step->__min_needed_to = FROM_LOOP_MIN_NEEDED_TO; |
184 | step->__max_needed_to = FROM_LOOP_MAX_NEEDED_TO; |
185 | } |
186 | else |
187 | { |
188 | step->__min_needed_from = TO_LOOP_MIN_NEEDED_FROM; |
189 | step->__max_needed_from = TO_LOOP_MAX_NEEDED_FROM; |
190 | step->__min_needed_to = TO_LOOP_MIN_NEEDED_TO; |
191 | step->__max_needed_to = TO_LOOP_MAX_NEEDED_TO; |
192 | } |
193 | |
194 | /* Yes, this is a stateful encoding. */ |
195 | step->__stateful = 1; |
196 | |
197 | result = __GCONV_OK; |
198 | } |
199 | } |
200 | |
201 | return result; |
202 | } |
203 | |
204 | |
extern void gconv_end (struct __gconv_step *data);

/* Release the private data attached to the step DATA (the block
   allocated by gconv_init).  The pointer is cleared afterwards so that a
   stale value can never be freed or dereferenced again.  */
void
gconv_end (struct __gconv_step *data)
{
  free (data->__data);
  data->__data = NULL;
}
211 | |
212 | |
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   The macro expects `data', `dir', `var', `outbuf', `outend' and `status'
   to be in scope at the point of expansion.  Only the bits above the
   three low bits of the state word are examined and cleared; the three
   low bits are preserved.  When converting to ISO-2022-JP with a
   non-ASCII set selected, the three bytes `Esc ( B' are written, or
   `status' is set to __GCONV_FULL_OUTPUT (and the state left alone) if
   they do not fit.  */
#define EMIT_SHIFT_TO_INIT \
  /* Avoid warning about unused variable 'var'.  */ \
  (void) var; \
  \
  if ((data->__statep->__count & ~7) != ASCII_set) \
    { \
      if (dir == from_iso2022jp \
          || (data->__statep->__count & CURRENT_SEL_MASK) == ASCII_set) \
        { \
          /* It's easy, we don't have to emit anything, we just reset the \
             state for the input.  Note that this also clears the G2 \
             designation.  */ \
          data->__statep->__count &= 7; \
          data->__statep->__count |= ASCII_set; \
        } \
      else \
        { \
          /* We are not in the initial state.  To switch back we have \
             to emit the sequence `Esc ( B'.  */ \
          if (__glibc_unlikely (outbuf + 3 > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = ESC; \
              *outbuf++ = '('; \
              *outbuf++ = 'B'; \
              /* Note that this also clears the G2 designation.  */ \
              data->__statep->__count &= 7; \
              data->__statep->__count |= ASCII_set; \
            } \
        } \
    }
250 | |
251 | |
/* Since we might have to reset the input pointer we must be able to save
   and restore the state.  With a non-zero SAVE the current state word
   (*setp) is copied into `save_set'; with a zero SAVE the previously
   saved value is written back to *setp.  Both `setp' and `save_set' must
   be in scope where the macro is expanded.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    save_set = *setp; \
  else \
    *setp = save_set
259 | |
260 | |
/* First define the conversion function from ISO-2022-JP to UCS4.

   One pass of BODY handles one unit of input: either a designation
   escape sequence (which only updates `set' or `set2' and restarts the
   loop via `continue'), or one character, which is decoded according to
   the currently selected set and stored as a 4-byte value at `outptr'.
   The names `inptr', `inend', `outptr', `result', `set', `set2' and `var'
   as well as `continue'/`break' refer to the loop this body is expanded
   in.  */
#define MIN_NEEDED_INPUT FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    \
    /* Recognize escape sequences.  */ \
    if (__builtin_expect (ch, 0) == ESC) \
      { \
        /* We now must be prepared to read two to three more \
           characters.  If we have a match in the first character but \
           then the input buffer ends we terminate with an error since \
           we must not risk missing an escape sequence just because it \
           is not entirely in the current input buffer.  */ \
        if (__builtin_expect (inptr + 2 >= inend, 0) \
            || (var == iso2022jp2 && inptr[1] == '$' && inptr[2] == '(' \
                && __builtin_expect (inptr + 3 >= inend, 0))) \
          { \
            /* Not enough input available.  */ \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        \
        if (inptr[1] == '(') \
          { \
            if (inptr[2] == 'B') \
              { \
                /* ASCII selected.  */ \
                set = ASCII_set; \
                inptr += 3; \
                continue; \
              } \
            else if (inptr[2] == 'J') \
              { \
                /* JIS X 0201 Roman selected.  */ \
                set = JISX0201_Roman_set; \
                inptr += 3; \
                continue; \
              } \
            else if (var == iso2022jp2 && inptr[2] == 'I') \
              { \
                /* JIS X 0201 Kana selected.  */ \
                set = JISX0201_Kana_set; \
                inptr += 3; \
                continue; \
              } \
          } \
        else if (inptr[1] == '$') \
          { \
            if (inptr[2] == '@') \
              { \
                /* JIS X 0208-1978 selected.  */ \
                set = JISX0208_1978_set; \
                inptr += 3; \
                continue; \
              } \
            else if (inptr[2] == 'B') \
              { \
                /* JIS X 0208-1983 selected.  */ \
                set = JISX0208_1983_set; \
                inptr += 3; \
                continue; \
              } \
            else if (var == iso2022jp2) \
              { \
                if (inptr[2] == 'A') \
                  { \
                    /* GB 2312-1980 selected.  */ \
                    set = GB2312_set; \
                    inptr += 3; \
                    continue; \
                  } \
                else if (inptr[2] == '(') \
                  { \
                    if (inptr[3] == 'C') \
                      { \
                        /* KSC 5601-1987 selected.  */ \
                        set = KSC5601_set; \
                        inptr += 4; \
                        continue; \
                      } \
                    else if (inptr[3] == 'D') \
                      { \
                        /* JIS X 0212-1990 selected.  */ \
                        set = JISX0212_set; \
                        inptr += 4; \
                        continue; \
                      } \
                  } \
              } \
          } \
        else if (var == iso2022jp2 && inptr[1] == '.') \
          { \
            if (inptr[2] == 'A') \
              { \
                /* ISO 8859-1-GR selected.  */ \
                set2 = ISO88591_set; \
                inptr += 3; \
                continue; \
              } \
            else if (inptr[2] == 'F') \
              { \
                /* ISO 8859-7-GR selected.  */ \
                set2 = ISO88597_set; \
                inptr += 3; \
                continue; \
              } \
          } \
      } \
    \
    /* `ESC N' followed by one byte encodes a single character from the \
       set currently designated as G2.  */ \
    if (ch == ESC && var == iso2022jp2 && inptr[1] == 'N') \
      { \
        if (set2 == ISO88591_set) \
          { \
            /* NOTE(review): unlike the ISO 8859-7 branch below, the \
               byte is not range-checked here -- confirm whether that \
               is intended.  */ \
            ch = inptr[2] | 0x80; \
            inptr += 3; \
          } \
        else if (__builtin_expect (set2, ISO88597_set) == ISO88597_set) \
          { \
            /* We use the table from the ISO 8859-7 module.  */ \
            if (inptr[2] < 0x20 || inptr[2] >= 0x80) \
              STANDARD_FROM_LOOP_ERR_HANDLER (1); \
            ch = iso88597_to_ucs4[inptr[2] - 0x20]; \
            if (ch == 0) \
              STANDARD_FROM_LOOP_ERR_HANDLER (3); \
            inptr += 3; \
          } \
        else \
          { \
            /* No G2 set has been designated.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    else if (ch >= 0x80) \
      { \
        /* Bytes with the high bit set are never valid input.  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (1); \
      } \
    else if (set == ASCII_set || (ch < 0x21 || ch == 0x7f)) \
      /* Almost done, just advance the input pointer.  */ \
      ++inptr; \
    else if (set == JISX0201_Roman_set) \
      { \
        /* Use the JIS X 0201 table.  */ \
        ch = jisx0201_to_ucs4 (ch); \
        if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \
          STANDARD_FROM_LOOP_ERR_HANDLER (1); \
        ++inptr; \
      } \
    else if (set == JISX0201_Kana_set) \
      { \
        /* Use the JIS X 0201 table; the Kana half is looked up with \
           the high bit set.  */ \
        ch = jisx0201_to_ucs4 (ch + 0x80); \
        if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \
          STANDARD_FROM_LOOP_ERR_HANDLER (1); \
        ++inptr; \
      } \
    else \
      { \
        if (set == JISX0208_1978_set || set == JISX0208_1983_set) \
          /* XXX I don't have the tables for these two old variants of \
             JIS X 0208.  Therefore I'm using the tables for JIS X \
             0208-1990.  If somebody has problems with this please \
             provide the appropriate tables.  */ \
          ch = jisx0208_to_ucs4 (&inptr, inend - inptr, 0); \
        else if (set == JISX0212_set) \
          /* Use the JIS X 0212 table.  */ \
          ch = jisx0212_to_ucs4 (&inptr, inend - inptr, 0); \
        else if (set == GB2312_set) \
          /* Use the GB 2312 table.  */ \
          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
        else \
          { \
            assert (set == KSC5601_set); \
            \
            /* Use the KSC 5601 table.  */ \
            ch = ksc5601_to_ucs4 (&inptr, inend - inptr, 0); \
          } \
        \
        /* A zero result is treated as "not enough input".  */ \
        if (__glibc_unlikely (ch == 0)) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        else if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \
          { \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    \
    put32 (outptr, ch); \
    outptr += 4; \
  }
456 | #define LOOP_NEED_FLAGS |
457 | #define , enum variant var, int *setp |
458 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
459 | int set2 = *setp & CURRENT_ASSIGN_MASK |
460 | #define UPDATE_PARAMS *setp = set | set2 |
461 | #include <iconv/loop.c> |
462 | |
463 | |
/* Next, define the other direction.  */

/* Families of character sets that are tried when a character cannot be
   written in the currently selected set.  `none' must stay zero: it
   terminates a conversion list.  */
enum conversion { none = 0, european, japanese, chinese, korean, other };

/* A datatype for conversion lists.  A list packs up to five
   `enum conversion' values into one integer, three bits each, the first
   entry in the lowest bits.  CVLIST_FIRST yields the head of a list,
   CVLIST_REST the list without its head; an exhausted list is zero.  */
typedef unsigned int cvlist_t;
#define CVLIST(cv1, cv2, cv3, cv4, cv5) \
  ((cv1) + ((cv2) << 3) + ((cv3) << 6) + ((cv4) << 9) + ((cv5) << 12))
#define CVLIST_FIRST(cvl) ((cvl) & ((1 << 3) - 1))
#define CVLIST_REST(cvl) ((cvl) >> 3)

/* Preference order per stable language tag state; the array is indexed
   with the tag value shifted right by eight bits.  */
static const cvlist_t conversion_lists[4] =
  {
    /* TAG_none */        CVLIST (japanese, european, chinese, korean, other),
    /* TAG_language_ja */ CVLIST (japanese, european, chinese, korean, other),
    /* TAG_language_ko */ CVLIST (korean, european, japanese, chinese, other),
    /* TAG_language_zh */ CVLIST (chinese, european, japanese, korean, other)
  };
481 | |
/* The conversion function from UCS4 to ISO-2022-JP.

   One pass of BODY consumes one 4-byte character from `inptr'.  For
   ISO-2022-JP-2, Unicode tag characters only update the language tag
   state.  Any other character is first tried in the currently selected
   set (subject to the language tag), then through a single shift into
   the designated G2 set, and finally by switching to another character
   set chosen from a tag-dependent preference list.  The names `inptr',
   `outptr', `outend', `result', `set', `set2', `tag' and `var' as well as
   `continue'/`break' refer to the loop this body is expanded in.  */
#define MIN_NEEDED_INPUT TO_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT TO_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT TO_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT TO_LOOP_MAX_NEEDED_TO
#define LOOPFCT TO_LOOP
#define BODY \
  { \
    uint32_t ch; \
    size_t written; \
    \
    ch = get32 (inptr); \
    \
    if (var == iso2022jp2) \
      { \
        /* Handle Unicode tag characters (range U+E0000..U+E007F).  */ \
        if (__glibc_unlikely ((ch >> 7) == (0xe0000 >> 7))) \
          { \
            ch &= 0x7f; \
            if (ch >= 'A' && ch <= 'Z') \
              ch += 'a' - 'A'; \
            if (ch == 0x01) \
              tag = TAG_language; \
            else if (ch == 'j' && tag == TAG_language) \
              tag = TAG_language_j; \
            else if (ch == 'a' && tag == TAG_language_j) \
              tag = TAG_language_ja; \
            else if (ch == 'k' && tag == TAG_language) \
              tag = TAG_language_k; \
            else if (ch == 'o' && tag == TAG_language_k) \
              tag = TAG_language_ko; \
            else if (ch == 'z' && tag == TAG_language) \
              tag = TAG_language_z; \
            else if (ch == 'h' && tag == TAG_language_z) \
              tag = TAG_language_zh; \
            else if (ch == 0x7f) \
              tag = TAG_none; \
            else \
              { \
                /* Other tag characters reset the tag parsing state (if the \
                   current state is a temporary state) or are ignored (if \
                   the current state is a stable one).  */ \
                if (tag >= TAG_language) \
                  tag = TAG_none; \
              } \
            \
            inptr += 4; \
            continue; \
          } \
        \
        /* Non-tag characters reset the tag parsing state, if the current \
           state is a temporary state.  */ \
        if (__glibc_unlikely (tag >= TAG_language)) \
          tag = TAG_none; \
      } \
    \
    /* First see whether we can write the character using the currently \
       selected character set.  But ignore the selected character set if \
       the current language tag shows different preferences.  */ \
    if (set == ASCII_set) \
      { \
        /* Please note that the NUL byte is *not* matched if we are not \
           currently using the ASCII charset.  This is because we must \
           switch to the initial state whenever a NUL byte is written.  */ \
        if (ch <= 0x7f) \
          { \
            *outptr++ = ch; \
            written = 1; \
            \
            /* At the beginning of a line, G2 designation is cleared.  */ \
            if (var == iso2022jp2 && ch == 0x0a) \
              set2 = UNSPECIFIED_set; \
          } \
        else \
          written = __UNKNOWN_10646_CHAR; \
      } \
    /* ISO-2022-JP recommends to encode the newline character always in \
       ASCII since this allows a context-free interpretation of the \
       characters at the beginning of the next line.  Otherwise it would \
       have to be known whether the last line ended using ASCII or \
       JIS X 0201.  */ \
    else if (set == JISX0201_Roman_set \
             && (__builtin_expect (tag == TAG_none, 1) \
                 || tag == TAG_language_ja)) \
      { \
        unsigned char buf[1]; \
        written = ucs4_to_jisx0201 (ch, buf); \
        if (written != __UNKNOWN_10646_CHAR) \
          { \
            if (buf[0] > 0x20 && buf[0] < 0x80) \
              { \
                *outptr++ = buf[0]; \
                written = 1; \
              } \
            else \
              written = __UNKNOWN_10646_CHAR; \
          } \
      } \
    else if (set == JISX0201_Kana_set \
             && (__builtin_expect (tag == TAG_none, 1) \
                 || tag == TAG_language_ja)) \
      { \
        unsigned char buf[1]; \
        written = ucs4_to_jisx0201 (ch, buf); \
        if (written != __UNKNOWN_10646_CHAR) \
          { \
            if (buf[0] > 0xa0 && buf[0] < 0xe0) \
              { \
                *outptr++ = buf[0] - 0x80; \
                written = 1; \
              } \
            else \
              written = __UNKNOWN_10646_CHAR; \
          } \
      } \
    else \
      { \
        if ((set == JISX0208_1978_set || set == JISX0208_1983_set) \
            && (__builtin_expect (tag == TAG_none, 1) \
                || tag == TAG_language_ja)) \
          written = ucs4_to_jisx0208 (ch, outptr, outend - outptr); \
        else if (set == JISX0212_set \
                 && (__builtin_expect (tag == TAG_none, 1) \
                     || tag == TAG_language_ja)) \
          written = ucs4_to_jisx0212 (ch, outptr, outend - outptr); \
        else if (set == GB2312_set \
                 && (__builtin_expect (tag == TAG_none, 1) \
                     || tag == TAG_language_zh)) \
          written = ucs4_to_gb2312 (ch, outptr, outend - outptr); \
        else if (set == KSC5601_set \
                 && (__builtin_expect (tag == TAG_none, 1) \
                     || tag == TAG_language_ko)) \
          written = ucs4_to_ksc5601 (ch, outptr, outend - outptr); \
        else \
          written = __UNKNOWN_10646_CHAR; \
        \
        /* A zero result is treated as "output buffer full".  */ \
        if (__glibc_unlikely (written == 0)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        else if (written != __UNKNOWN_10646_CHAR) \
          outptr += written; \
      } \
    \
    /* Next try a single shift (`ESC N') into the G2 set that is \
       already designated, provided no language tag is active.  */ \
    if (written == __UNKNOWN_10646_CHAR \
        && __builtin_expect (tag == TAG_none, 1)) \
      { \
        if (set2 == ISO88591_set) \
          { \
            if (ch >= 0x80 && ch <= 0xff) \
              { \
                if (__glibc_unlikely (outptr + 3 > outend)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                \
                *outptr++ = ESC; \
                *outptr++ = 'N'; \
                *outptr++ = ch & 0x7f; \
                written = 3; \
              } \
          } \
        else if (set2 == ISO88597_set) \
          { \
            if (__glibc_likely (ch < 0xffff)) \
              { \
                const struct gap *rp = from_idx; \
                \
                while (ch > rp->end) \
                  ++rp; \
                if (ch >= rp->start) \
                  { \
                    unsigned char res = \
                      iso88597_from_ucs4[ch - 0xa0 + rp->idx]; \
                    if (res != '\0') \
                      { \
                        if (__glibc_unlikely (outptr + 3 > outend)) \
                          { \
                            result = __GCONV_FULL_OUTPUT; \
                            break; \
                          } \
                        \
                        *outptr++ = ESC; \
                        *outptr++ = 'N'; \
                        *outptr++ = res & 0x7f; \
                        written = 3; \
                      } \
                  } \
              } \
          } \
      } \
    \
    if (written == __UNKNOWN_10646_CHAR) \
      { \
        /* The attempts to use the currently selected character set \
           failed, either because the language tag changed, or because \
           the character requires a different character set, or because \
           the character is unknown. \
           The CJK character sets partially overlap when seen as subsets \
           of ISO 10646; therefore there is no single correct result. \
           We use a preference order which depends on the language tag.  */ \
        \
        if (ch <= 0x7f) \
          { \
            /* We must encode using ASCII.  First write out the \
               escape sequence.  */ \
            if (__glibc_unlikely (outptr + 3 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            *outptr++ = ESC; \
            *outptr++ = '('; \
            *outptr++ = 'B'; \
            set = ASCII_set; \
            \
            if (__glibc_unlikely (outptr + 1 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            *outptr++ = ch; \
            \
            /* At the beginning of a line, G2 designation is cleared.  */ \
            if (var == iso2022jp2 && ch == 0x0a) \
              set2 = UNSPECIFIED_set; \
          } \
        else \
          { \
            /* Now it becomes difficult.  We must search the other \
               character sets one by one.  Use an ordered conversion \
               list that depends on the current language tag.  */ \
            cvlist_t conversion_list; \
            unsigned char buf[2]; \
            int res = __GCONV_ILLEGAL_INPUT; \
            \
            if (var == iso2022jp2) \
              conversion_list = conversion_lists[tag >> 8]; \
            else \
              conversion_list = CVLIST (japanese, 0, 0, 0, 0); \
            \
            /* Each `break' inside the switch only leaves the switch; \
               the enclosing do-while then decides, based on `res', \
               whether the next list entry is tried.  */ \
            do \
              switch (CVLIST_FIRST (conversion_list)) \
                { \
                case european: \
                  \
                  /* Try ISO 8859-1 upper half.  */ \
                  if (ch >= 0x80 && ch <= 0xff) \
                    { \
                      if (set2 != ISO88591_set) \
                        { \
                          if (__builtin_expect (outptr + 3 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '.'; \
                          *outptr++ = 'A'; \
                          set2 = ISO88591_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 3 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = ESC; \
                      *outptr++ = 'N'; \
                      *outptr++ = ch - 0x80; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  /* Try ISO 8859-7 upper half.  */ \
                  if (__glibc_likely (ch < 0xffff)) \
                    { \
                      const struct gap *rp = from_idx; \
                      \
                      while (ch > rp->end) \
                        ++rp; \
                      if (ch >= rp->start) \
                        { \
                          unsigned char ch2 = \
                            iso88597_from_ucs4[ch - 0xa0 + rp->idx]; \
                          if (ch2 != '\0') \
                            { \
                              if (set2 != ISO88597_set) \
                                { \
                                  if (__builtin_expect (outptr + 3 > outend, \
                                                        0)) \
                                    { \
                                      res = __GCONV_FULL_OUTPUT; \
                                      break; \
                                    } \
                                  *outptr++ = ESC; \
                                  *outptr++ = '.'; \
                                  *outptr++ = 'F'; \
                                  set2 = ISO88597_set; \
                                } \
                              \
                              if (__builtin_expect (outptr + 3 > outend, 0)) \
                                { \
                                  res = __GCONV_FULL_OUTPUT; \
                                  break; \
                                } \
                              *outptr++ = ESC; \
                              *outptr++ = 'N'; \
                              *outptr++ = ch2 - 0x80; \
                              res = __GCONV_OK; \
                              break; \
                            } \
                        } \
                    } \
                  \
                  break; \
                  \
                case japanese: \
                  \
                  /* Try JIS X 0201 Roman.  */ \
                  written = ucs4_to_jisx0201 (ch, buf); \
                  if (written != __UNKNOWN_10646_CHAR \
                      && buf[0] > 0x20 && buf[0] < 0x80) \
                    { \
                      if (set != JISX0201_Roman_set) \
                        { \
                          if (__builtin_expect (outptr + 3 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '('; \
                          *outptr++ = 'J'; \
                          set = JISX0201_Roman_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 1 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0]; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  /* Try JIS X 0208.  */ \
                  written = ucs4_to_jisx0208 (ch, buf, 2); \
                  if (written != __UNKNOWN_10646_CHAR) \
                    { \
                      if (set != JISX0208_1983_set) \
                        { \
                          if (__builtin_expect (outptr + 3 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '$'; \
                          *outptr++ = 'B'; \
                          set = JISX0208_1983_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 2 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0]; \
                      *outptr++ = buf[1]; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  if (__glibc_unlikely (var == iso2022jp)) \
                    /* Don't use the other Japanese character sets.  */ \
                    break; \
                  \
                  /* Try JIS X 0212.  */ \
                  written = ucs4_to_jisx0212 (ch, buf, 2); \
                  if (written != __UNKNOWN_10646_CHAR) \
                    { \
                      if (set != JISX0212_set) \
                        { \
                          if (__builtin_expect (outptr + 4 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '$'; \
                          *outptr++ = '('; \
                          *outptr++ = 'D'; \
                          set = JISX0212_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 2 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0]; \
                      *outptr++ = buf[1]; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  break; \
                  \
                case chinese: \
                  assert (var == iso2022jp2); \
                  \
                  /* Try GB 2312.  */ \
                  written = ucs4_to_gb2312 (ch, buf, 2); \
                  if (written != __UNKNOWN_10646_CHAR) \
                    { \
                      if (set != GB2312_set) \
                        { \
                          if (__builtin_expect (outptr + 3 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '$'; \
                          *outptr++ = 'A'; \
                          set = GB2312_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 2 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0]; \
                      *outptr++ = buf[1]; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  break; \
                  \
                case korean: \
                  assert (var == iso2022jp2); \
                  \
                  /* Try KSC 5601.  */ \
                  written = ucs4_to_ksc5601 (ch, buf, 2); \
                  if (written != __UNKNOWN_10646_CHAR) \
                    { \
                      if (set != KSC5601_set) \
                        { \
                          if (__builtin_expect (outptr + 4 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '$'; \
                          *outptr++ = '('; \
                          *outptr++ = 'C'; \
                          set = KSC5601_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 2 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0]; \
                      *outptr++ = buf[1]; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  break; \
                  \
                case other: \
                  assert (var == iso2022jp2); \
                  \
                  /* Try JIS X 0201 Kana.  This is not officially part \
                     of ISO-2022-JP-2, according to RFC 1554.  Therefore \
                     we try this only after all other attempts.  */ \
                  written = ucs4_to_jisx0201 (ch, buf); \
                  if (written != __UNKNOWN_10646_CHAR && buf[0] >= 0x80) \
                    { \
                      if (set != JISX0201_Kana_set) \
                        { \
                          if (__builtin_expect (outptr + 3 > outend, 0)) \
                            { \
                              res = __GCONV_FULL_OUTPUT; \
                              break; \
                            } \
                          *outptr++ = ESC; \
                          *outptr++ = '('; \
                          *outptr++ = 'I'; \
                          set = JISX0201_Kana_set; \
                        } \
                      \
                      if (__glibc_unlikely (outptr + 1 > outend)) \
                        { \
                          res = __GCONV_FULL_OUTPUT; \
                          break; \
                        } \
                      *outptr++ = buf[0] - 0x80; \
                      res = __GCONV_OK; \
                      break; \
                    } \
                  \
                  break; \
                  \
                default: \
                  abort (); \
                } \
            while (res == __GCONV_ILLEGAL_INPUT \
                   && (conversion_list = CVLIST_REST (conversion_list)) != 0); \
            \
            if (res == __GCONV_FULL_OUTPUT) \
              { \
                result = res; \
                break; \
              } \
            \
            if (res == __GCONV_ILLEGAL_INPUT) \
              { \
                STANDARD_TO_LOOP_ERR_HANDLER (4); \
              } \
          } \
      } \
    \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
1017 | #define LOOP_NEED_FLAGS |
1018 | #define , enum variant var, int *setp |
1019 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
1020 | int set2 = *setp & CURRENT_ASSIGN_MASK; \ |
1021 | int tag = *setp & CURRENT_TAG_MASK; |
1022 | #define REINIT_PARAMS do \ |
1023 | { \ |
1024 | set = *setp & CURRENT_SEL_MASK; \ |
1025 | set2 = *setp & CURRENT_ASSIGN_MASK; \ |
1026 | tag = *setp & CURRENT_TAG_MASK; \ |
1027 | } \ |
1028 | while (0) |
1029 | #define UPDATE_PARAMS *setp = set | set2 | tag |
1030 | #include <iconv/loop.c> |
1031 | |
1032 | |
1033 | /* Now define the toplevel functions. */ |
1034 | #include <iconv/skeleton.c> |
1035 | |