1 | /* Conversion module for ISO-2022-JP and ISO-2022-JP-2. |
2 | Copyright (C) 1998-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <assert.h> |
20 | #include <dlfcn.h> |
21 | #include <gconv.h> |
22 | #include <stdint.h> |
23 | #include <stdlib.h> |
24 | #include <string.h> |
25 | #include "jis0201.h" |
26 | #include "jis0208.h" |
27 | #include "jis0212.h" |
28 | #include "gb2312.h" |
29 | #include "ksc5601.h" |
30 | |
/* One contiguous range [START, END] of UCS4 code points together with
   the index adjustment IDX for that range.  The conversion code in this
   file walks an array of these records and, for a character CH inside a
   range, looks up `ch - 0xa0 + idx' in the ISO 8859-7 reverse table.
   Presumably both tables come from iso8859-7jp.h, which is included
   right after this definition.  */
struct gap
{
  uint16_t start, end;	/* First and last code point of the range.  */
  int32_t idx;		/* Added to `ch - 0xa0' to index the table.  */
};
37 | |
38 | #include "iso8859-7jp.h" |
39 | |
/* This makes obvious what everybody knows: 0x1b is the Esc character.  */
#define ESC 0x1b

/* We provide our own initialization and destructor function
   (gconv_init and gconv_end in this file).  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0

/* Definitions used in the body of the `gconv' function.

   The FROM_LOOP_* values describe decoding: one to four input bytes
   (a plain byte up to a four-byte escape sequence) produce one 4-byte
   UCS4 value.  The TO_LOOP_* values describe encoding: one 4-byte UCS4
   value produces between one and six bytes, the maximum being an
   escape sequence followed by the character itself.  */
#define FROM_LOOP		from_iso2022jp_loop
#define TO_LOOP			to_iso2022jp_loop
#define ONE_DIRECTION			0
#define FROM_LOOP_MIN_NEEDED_FROM	1
#define FROM_LOOP_MAX_NEEDED_FROM	4
#define FROM_LOOP_MIN_NEEDED_TO		4
#define FROM_LOOP_MAX_NEEDED_TO		4
#define TO_LOOP_MIN_NEEDED_FROM		4
#define TO_LOOP_MAX_NEEDED_FROM		4
#define TO_LOOP_MIN_NEEDED_TO		1
#define TO_LOOP_MAX_NEEDED_TO		6
#define FROM_DIRECTION		(dir == from_iso2022jp)

/* Locals made available around the loop calls: DIR and VAR are read
   from the per-step data stored by gconv_init, SETP points at the
   state word holding the current designations, and SAVE_SET is the
   scratch copy used by SAVE_RESET_STATE.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct iso2022jp_data *) step->__data)->dir; \
  enum variant var = ((struct iso2022jp_data *) step->__data)->var; \
  int save_set; \
  int *setp = &data->__statep->__count;

/* Extra arguments appended to the loop function calls.  They must line
   up with the `enum variant var, int *setp' parameters which the loop
   functions declare.  */
#define EXTRA_LOOP_ARGS		, var, setp
66 | |
67 | |
/* Direction of the transformation.  ILLEGAL_DIR is the value gconv_init
   starts with before it has matched one of the charset names.  */
enum direction
{
  illegal_dir = 0,
  to_iso2022jp = 1,
  from_iso2022jp = 2
};
75 | |
/* We handle ISO-2022-JP and ISO-2022-JP-2 here.  ILLEGAL_VAR is the
   value gconv_init starts with before a charset name has matched.  */
enum variant
{
  illegal_var = 0,
  iso2022jp = 1,
  iso2022jp2 = 2
};
83 | |
84 | |
85 | struct iso2022jp_data |
86 | { |
87 | enum direction dir; |
88 | enum variant var; |
89 | }; |
90 | |
91 | |
/* The COUNT element of the state keeps track of the currently selected
   character set.  The selection occupies bits 3..5 of the word; the
   possible values are:  */
enum
{
  ASCII_set = 0x00,
  JISX0208_1978_set = 0x08,
  JISX0208_1983_set = 0x10,
  JISX0201_Roman_set = 0x18,
  JISX0201_Kana_set = 0x20,
  GB2312_set = 0x28,
  KSC5601_set = 0x30,
  JISX0212_set = 0x38,
  CURRENT_SEL_MASK = 0x38
};
106 | |
/* The second value stored is the designation of the G2 set.  It
   occupies bits 6..7 of the state word; the following values are
   possible:  */
enum
{
  UNSPECIFIED_set = 0x00,
  ISO88591_set = 0x40,
  ISO88597_set = 0x80,
  CURRENT_ASSIGN_MASK = 0xc0
};
116 | |
/* The third value, only used during conversion from Unicode to
   ISO-2022-JP-2, describes the language tag parsing status.  It
   occupies bits 8..10 of the state word.  The stable states (values
   below TAG_language) double as indices into conversion_lists once
   shifted right by 8; values >= TAG_language are temporary tag
   parsing states.  */
enum
{
  /* Stable states.  */
  TAG_none = 0,
  TAG_language_ja = 1 << 8,
  TAG_language_ko = 2 << 8,
  TAG_language_zh = 3 << 8,

  /* Temporary states while the characters of a tag are being read.  */
  TAG_language = 4 << 8,
  TAG_language_j = 5 << 8,
  TAG_language_k = 6 << 8,
  TAG_language_z = 7 << 8,

  CURRENT_TAG_MASK = 7 << 8
};
132 | |
133 | |
134 | extern int gconv_init (struct __gconv_step *step); |
135 | int |
136 | gconv_init (struct __gconv_step *step) |
137 | { |
138 | /* Determine which direction. */ |
139 | struct iso2022jp_data *new_data; |
140 | enum direction dir = illegal_dir; |
141 | enum variant var = illegal_var; |
142 | int result; |
143 | |
144 | if (__strcasecmp (step->__from_name, "ISO-2022-JP//" ) == 0) |
145 | { |
146 | dir = from_iso2022jp; |
147 | var = iso2022jp; |
148 | } |
149 | else if (__strcasecmp (step->__to_name, "ISO-2022-JP//" ) == 0) |
150 | { |
151 | dir = to_iso2022jp; |
152 | var = iso2022jp; |
153 | } |
154 | else if (__strcasecmp (step->__from_name, "ISO-2022-JP-2//" ) == 0) |
155 | { |
156 | dir = from_iso2022jp; |
157 | var = iso2022jp2; |
158 | } |
159 | else if (__strcasecmp (step->__to_name, "ISO-2022-JP-2//" ) == 0) |
160 | { |
161 | dir = to_iso2022jp; |
162 | var = iso2022jp2; |
163 | } |
164 | |
165 | result = __GCONV_NOCONV; |
166 | if (__builtin_expect (dir, from_iso2022jp) != illegal_dir) |
167 | { |
168 | new_data |
169 | = (struct iso2022jp_data *) malloc (sizeof (struct iso2022jp_data)); |
170 | |
171 | result = __GCONV_NOMEM; |
172 | if (new_data != NULL) |
173 | { |
174 | new_data->dir = dir; |
175 | new_data->var = var; |
176 | step->__data = new_data; |
177 | |
178 | if (dir == from_iso2022jp) |
179 | { |
180 | step->__min_needed_from = FROM_LOOP_MIN_NEEDED_FROM; |
181 | step->__max_needed_from = FROM_LOOP_MAX_NEEDED_FROM; |
182 | step->__min_needed_to = FROM_LOOP_MIN_NEEDED_TO; |
183 | step->__max_needed_to = FROM_LOOP_MAX_NEEDED_TO; |
184 | } |
185 | else |
186 | { |
187 | step->__min_needed_from = TO_LOOP_MIN_NEEDED_FROM; |
188 | step->__max_needed_from = TO_LOOP_MAX_NEEDED_FROM; |
189 | step->__min_needed_to = TO_LOOP_MIN_NEEDED_TO; |
190 | step->__max_needed_to = TO_LOOP_MAX_NEEDED_TO; |
191 | } |
192 | |
193 | /* Yes, this is a stateful encoding. */ |
194 | step->__stateful = 1; |
195 | |
196 | result = __GCONV_OK; |
197 | } |
198 | } |
199 | |
200 | return result; |
201 | } |
202 | |
203 | |
204 | extern void gconv_end (struct __gconv_step *data); |
205 | void |
206 | gconv_end (struct __gconv_step *data) |
207 | { |
208 | free (data->__data); |
209 | } |
210 | |
211 | |
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   Nothing happens when all bits above the low three of the state word
   are already clear.  Otherwise the low three bits are kept and
   everything else is reset; when encoding with a non-ASCII set selected
   the sequence `Esc ( B' is written first, or STATUS is set to
   __GCONV_FULL_OUTPUT (leaving the state alone) when fewer than three
   bytes of output space remain.  */
#define EMIT_SHIFT_TO_INIT \
  /* Avoid warning about unused variable 'var'.  */ \
  (void) var; \
  \
  if ((data->__statep->__count & ~7) != ASCII_set) \
    { \
      if (dir != from_iso2022jp \
          && (data->__statep->__count & CURRENT_SEL_MASK) != ASCII_set) \
        { \
          /* We are writing ISO-2022-JP and are not in the initial \
             state.  To switch back we have to emit the sequence \
             `Esc ( B'.  */ \
          if (__glibc_unlikely (outbuf + 3 > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = ESC; \
              *outbuf++ = '('; \
              *outbuf++ = 'B'; \
              /* Note that this also clears the G2 designation.  */ \
              data->__statep->__count \
                = (data->__statep->__count & 7) | ASCII_set; \
            } \
        } \
      else \
        /* It's easy, we don't have to emit anything, we just reset \
           the state.  Note that this also clears the G2 \
           designation.  */ \
        data->__statep->__count \
          = (data->__statep->__count & 7) | ASCII_set; \
    }
249 | |
250 | |
/* Since we might have to reset the input pointer we must be able to
   save and restore the state.  A nonzero SAVE copies the state word
   (*setp) into save_set; a zero SAVE writes save_set back.  */
#define SAVE_RESET_STATE(Save) \
  do \
    { \
      if (Save) \
        save_set = *setp; \
      else \
        *setp = save_set; \
    } \
  while (0)
258 | |
259 | |
260 | /* First define the conversion function from ISO-2022-JP to UCS4. */ |
261 | #define MIN_NEEDED_INPUT FROM_LOOP_MIN_NEEDED_FROM |
262 | #define MAX_NEEDED_INPUT FROM_LOOP_MAX_NEEDED_FROM |
263 | #define MIN_NEEDED_OUTPUT FROM_LOOP_MIN_NEEDED_TO |
264 | #define MAX_NEEDED_OUTPUT FROM_LOOP_MAX_NEEDED_TO |
265 | #define LOOPFCT FROM_LOOP |
266 | #define BODY \ |
267 | { \ |
268 | uint32_t ch = *inptr; \ |
269 | \ |
270 | /* Recognize escape sequences. */ \ |
271 | if (__builtin_expect (ch, 0) == ESC) \ |
272 | { \ |
273 | /* We now must be prepared to read two to three more \ |
274 | characters. If we have a match in the first character but \ |
275 | then the input buffer ends we terminate with an error since \ |
276 | we must not risk missing an escape sequence just because it \ |
277 | is not entirely in the current input buffer. */ \ |
278 | if (__builtin_expect (inptr + 2 >= inend, 0) \ |
279 | || (var == iso2022jp2 && inptr[1] == '$' && inptr[2] == '(' \ |
280 | && __builtin_expect (inptr + 3 >= inend, 0))) \ |
281 | { \ |
282 | /* Not enough input available. */ \ |
283 | result = __GCONV_INCOMPLETE_INPUT; \ |
284 | break; \ |
285 | } \ |
286 | \ |
287 | if (inptr[1] == '(') \ |
288 | { \ |
289 | if (inptr[2] == 'B') \ |
290 | { \ |
291 | /* ASCII selected. */ \ |
292 | set = ASCII_set; \ |
293 | inptr += 3; \ |
294 | continue; \ |
295 | } \ |
296 | else if (inptr[2] == 'J') \ |
297 | { \ |
298 | /* JIS X 0201 selected. */ \ |
299 | set = JISX0201_Roman_set; \ |
300 | inptr += 3; \ |
301 | continue; \ |
302 | } \ |
303 | else if (var == iso2022jp2 && inptr[2] == 'I') \ |
304 | { \ |
305 | /* JIS X 0201 selected. */ \ |
306 | set = JISX0201_Kana_set; \ |
307 | inptr += 3; \ |
308 | continue; \ |
309 | } \ |
310 | } \ |
311 | else if (inptr[1] == '$') \ |
312 | { \ |
313 | if (inptr[2] == '@') \ |
314 | { \ |
315 | /* JIS X 0208-1978 selected. */ \ |
316 | set = JISX0208_1978_set; \ |
317 | inptr += 3; \ |
318 | continue; \ |
319 | } \ |
320 | else if (inptr[2] == 'B') \ |
321 | { \ |
322 | /* JIS X 0208-1983 selected. */ \ |
323 | set = JISX0208_1983_set; \ |
324 | inptr += 3; \ |
325 | continue; \ |
326 | } \ |
327 | else if (var == iso2022jp2) \ |
328 | { \ |
329 | if (inptr[2] == 'A') \ |
330 | { \ |
331 | /* GB 2312-1980 selected. */ \ |
332 | set = GB2312_set; \ |
333 | inptr += 3; \ |
334 | continue; \ |
335 | } \ |
336 | else if (inptr[2] == '(') \ |
337 | { \ |
338 | if (inptr[3] == 'C') \ |
339 | { \ |
340 | /* KSC 5601-1987 selected. */ \ |
341 | set = KSC5601_set; \ |
342 | inptr += 4; \ |
343 | continue; \ |
344 | } \ |
345 | else if (inptr[3] == 'D') \ |
346 | { \ |
347 | /* JIS X 0212-1990 selected. */ \ |
348 | set = JISX0212_set; \ |
349 | inptr += 4; \ |
350 | continue; \ |
351 | } \ |
352 | } \ |
353 | } \ |
354 | } \ |
355 | else if (var == iso2022jp2 && inptr[1] == '.') \ |
356 | { \ |
357 | if (inptr[2] == 'A') \ |
358 | { \ |
359 | /* ISO 8859-1-GR selected. */ \ |
360 | set2 = ISO88591_set; \ |
361 | inptr += 3; \ |
362 | continue; \ |
363 | } \ |
364 | else if (inptr[2] == 'F') \ |
365 | { \ |
366 | /* ISO 8859-7-GR selected. */ \ |
367 | set2 = ISO88597_set; \ |
368 | inptr += 3; \ |
369 | continue; \ |
370 | } \ |
371 | } \ |
372 | } \ |
373 | \ |
374 | if (ch == ESC && var == iso2022jp2 && inptr[1] == 'N') \ |
375 | { \ |
376 | if (set2 == ISO88591_set) \ |
377 | { \ |
378 | ch = inptr[2] | 0x80; \ |
379 | inptr += 3; \ |
380 | } \ |
381 | else if (__builtin_expect (set2, ISO88597_set) == ISO88597_set) \ |
382 | { \ |
383 | /* We use the table from the ISO 8859-7 module. */ \ |
384 | if (inptr[2] < 0x20 || inptr[2] >= 0x80) \ |
385 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
386 | ch = iso88597_to_ucs4[inptr[2] - 0x20]; \ |
387 | if (ch == 0) \ |
388 | STANDARD_FROM_LOOP_ERR_HANDLER (3); \ |
389 | inptr += 3; \ |
390 | } \ |
391 | else \ |
392 | { \ |
393 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
394 | } \ |
395 | } \ |
396 | else if (ch >= 0x80) \ |
397 | { \ |
398 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
399 | } \ |
400 | else if (set == ASCII_set || (ch < 0x21 || ch == 0x7f)) \ |
401 | /* Almost done, just advance the input pointer. */ \ |
402 | ++inptr; \ |
403 | else if (set == JISX0201_Roman_set) \ |
404 | { \ |
405 | /* Use the JIS X 0201 table. */ \ |
406 | ch = jisx0201_to_ucs4 (ch); \ |
407 | if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \ |
408 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
409 | ++inptr; \ |
410 | } \ |
411 | else if (set == JISX0201_Kana_set) \ |
412 | { \ |
413 | /* Use the JIS X 0201 table. */ \ |
414 | ch = jisx0201_to_ucs4 (ch + 0x80); \ |
415 | if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \ |
416 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
417 | ++inptr; \ |
418 | } \ |
419 | else \ |
420 | { \ |
421 | if (set == JISX0208_1978_set || set == JISX0208_1983_set) \ |
422 | /* XXX I don't have the tables for these two old variants of \ |
423 | JIS X 0208. Therefore I'm using the tables for JIS X \ |
424 | 0208-1990. If somebody has problems with this please \ |
425 | provide the appropriate tables. */ \ |
426 | ch = jisx0208_to_ucs4 (&inptr, inend - inptr, 0); \ |
427 | else if (set == JISX0212_set) \ |
428 | /* Use the JIS X 0212 table. */ \ |
429 | ch = jisx0212_to_ucs4 (&inptr, inend - inptr, 0); \ |
430 | else if (set == GB2312_set) \ |
431 | /* Use the GB 2312 table. */ \ |
432 | ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \ |
433 | else \ |
434 | { \ |
435 | assert (set == KSC5601_set); \ |
436 | \ |
437 | /* Use the KSC 5601 table. */ \ |
438 | ch = ksc5601_to_ucs4 (&inptr, inend - inptr, 0); \ |
439 | } \ |
440 | \ |
441 | if (__glibc_unlikely (ch == 0)) \ |
442 | { \ |
443 | result = __GCONV_INCOMPLETE_INPUT; \ |
444 | break; \ |
445 | } \ |
446 | else if (__glibc_unlikely (ch == __UNKNOWN_10646_CHAR)) \ |
447 | { \ |
448 | STANDARD_FROM_LOOP_ERR_HANDLER (1); \ |
449 | } \ |
450 | } \ |
451 | \ |
452 | put32 (outptr, ch); \ |
453 | outptr += 4; \ |
454 | } |
455 | #define LOOP_NEED_FLAGS |
456 | #define , enum variant var, int *setp |
457 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
458 | int set2 = *setp & CURRENT_ASSIGN_MASK |
459 | #define UPDATE_PARAMS *setp = set | set2 |
460 | #include <iconv/loop.c> |
461 | |
462 | |
/* Next, define the other direction.  */

/* Families of character sets which the encoder can try.  NONE is zero
   so that an exhausted conversion list compares equal to zero.  */
enum conversion
{
  none = 0,
  european = 1,
  japanese = 2,
  chinese = 3,
  korean = 4,
  other = 5
};

/* A datatype for conversion lists.  Up to five `enum conversion'
   values are packed three bits apiece, the first element occupying the
   lowest bits.  */
typedef unsigned int cvlist_t;
#define CVLIST(cv1, cv2, cv3, cv4, cv5) \
  ((cv1) + ((cv2) << 3) + ((cv3) << 6) + ((cv4) << 9) + ((cv5) << 12))
/* Head of the list.  */
#define CVLIST_FIRST(cvl) ((cvl) & 0x7)
/* The list with its head removed.  */
#define CVLIST_REST(cvl) ((cvl) >> 3)

/* Order in which the families are tried, indexed by the stable
   language tag state shifted right by 8.  */
static const cvlist_t conversion_lists[4] =
{
  [0] /* TAG_none */
    = CVLIST (japanese, european, chinese, korean, other),
  [1] /* TAG_language_ja */
    = CVLIST (japanese, european, chinese, korean, other),
  [2] /* TAG_language_ko */
    = CVLIST (korean, european, japanese, chinese, other),
  [3] /* TAG_language_zh */
    = CVLIST (chinese, european, japanese, korean, other)
};
480 | |
481 | #define MIN_NEEDED_INPUT TO_LOOP_MIN_NEEDED_FROM |
482 | #define MAX_NEEDED_INPUT TO_LOOP_MAX_NEEDED_FROM |
483 | #define MIN_NEEDED_OUTPUT TO_LOOP_MIN_NEEDED_TO |
484 | #define MAX_NEEDED_OUTPUT TO_LOOP_MAX_NEEDED_TO |
485 | #define LOOPFCT TO_LOOP |
486 | #define BODY \ |
487 | { \ |
488 | uint32_t ch; \ |
489 | size_t written; \ |
490 | \ |
491 | ch = get32 (inptr); \ |
492 | \ |
493 | if (var == iso2022jp2) \ |
494 | { \ |
495 | /* Handle Unicode tag characters (range U+E0000..U+E007F). */ \ |
496 | if (__glibc_unlikely ((ch >> 7) == (0xe0000 >> 7))) \ |
497 | { \ |
498 | ch &= 0x7f; \ |
499 | if (ch >= 'A' && ch <= 'Z') \ |
500 | ch += 'a' - 'A'; \ |
501 | if (ch == 0x01) \ |
502 | tag = TAG_language; \ |
503 | else if (ch == 'j' && tag == TAG_language) \ |
504 | tag = TAG_language_j; \ |
505 | else if (ch == 'a' && tag == TAG_language_j) \ |
506 | tag = TAG_language_ja; \ |
507 | else if (ch == 'k' && tag == TAG_language) \ |
508 | tag = TAG_language_k; \ |
509 | else if (ch == 'o' && tag == TAG_language_k) \ |
510 | tag = TAG_language_ko; \ |
511 | else if (ch == 'z' && tag == TAG_language) \ |
512 | tag = TAG_language_z; \ |
513 | else if (ch == 'h' && tag == TAG_language_z) \ |
514 | tag = TAG_language_zh; \ |
515 | else if (ch == 0x7f) \ |
516 | tag = TAG_none; \ |
517 | else \ |
518 | { \ |
519 | /* Other tag characters reset the tag parsing state (if the \ |
520 | current state is a temporary state) or are ignored (if \ |
521 | the current state is a stable one). */ \ |
522 | if (tag >= TAG_language) \ |
523 | tag = TAG_none; \ |
524 | } \ |
525 | \ |
526 | inptr += 4; \ |
527 | continue; \ |
528 | } \ |
529 | \ |
530 | /* Non-tag characters reset the tag parsing state, if the current \ |
531 | state is a temporary state. */ \ |
532 | if (__glibc_unlikely (tag >= TAG_language)) \ |
533 | tag = TAG_none; \ |
534 | } \ |
535 | \ |
536 | /* First see whether we can write the character using the currently \ |
537 | selected character set. But ignore the selected character set if \ |
538 | the current language tag shows different preferences. */ \ |
539 | if (set == ASCII_set) \ |
540 | { \ |
541 | /* Please note that the NUL byte is *not* matched if we are not \ |
542 | currently using the ASCII charset. This is because we must \ |
543 | switch to the initial state whenever a NUL byte is written. */ \ |
544 | if (ch <= 0x7f) \ |
545 | { \ |
546 | *outptr++ = ch; \ |
547 | written = 1; \ |
548 | \ |
549 | /* At the beginning of a line, G2 designation is cleared. */ \ |
550 | if (var == iso2022jp2 && ch == 0x0a) \ |
551 | set2 = UNSPECIFIED_set; \ |
552 | } \ |
553 | else \ |
554 | written = __UNKNOWN_10646_CHAR; \ |
555 | } \ |
556 | /* ISO-2022-JP recommends to encode the newline character always in \ |
557 | ASCII since this allows a context-free interpretation of the \ |
558 | characters at the beginning of the next line. Otherwise it would \ |
559 | have to be known whether the last line ended using ASCII or \ |
560 | JIS X 0201. */ \ |
561 | else if (set == JISX0201_Roman_set \ |
562 | && (__builtin_expect (tag == TAG_none, 1) \ |
563 | || tag == TAG_language_ja)) \ |
564 | { \ |
565 | unsigned char buf[1]; \ |
566 | written = ucs4_to_jisx0201 (ch, buf); \ |
567 | if (written != __UNKNOWN_10646_CHAR) \ |
568 | { \ |
569 | if (buf[0] > 0x20 && buf[0] < 0x80) \ |
570 | { \ |
571 | *outptr++ = buf[0]; \ |
572 | written = 1; \ |
573 | } \ |
574 | else \ |
575 | written = __UNKNOWN_10646_CHAR; \ |
576 | } \ |
577 | } \ |
578 | else if (set == JISX0201_Kana_set \ |
579 | && (__builtin_expect (tag == TAG_none, 1) \ |
580 | || tag == TAG_language_ja)) \ |
581 | { \ |
582 | unsigned char buf[1]; \ |
583 | written = ucs4_to_jisx0201 (ch, buf); \ |
584 | if (written != __UNKNOWN_10646_CHAR) \ |
585 | { \ |
586 | if (buf[0] > 0xa0 && buf[0] < 0xe0) \ |
587 | { \ |
588 | *outptr++ = buf[0] - 0x80; \ |
589 | written = 1; \ |
590 | } \ |
591 | else \ |
592 | written = __UNKNOWN_10646_CHAR; \ |
593 | } \ |
594 | } \ |
595 | else \ |
596 | { \ |
597 | if ((set == JISX0208_1978_set || set == JISX0208_1983_set) \ |
598 | && (__builtin_expect (tag == TAG_none, 1) \ |
599 | || tag == TAG_language_ja)) \ |
600 | written = ucs4_to_jisx0208 (ch, outptr, outend - outptr); \ |
601 | else if (set == JISX0212_set \ |
602 | && (__builtin_expect (tag == TAG_none, 1) \ |
603 | || tag == TAG_language_ja)) \ |
604 | written = ucs4_to_jisx0212 (ch, outptr, outend - outptr); \ |
605 | else if (set == GB2312_set \ |
606 | && (__builtin_expect (tag == TAG_none, 1) \ |
607 | || tag == TAG_language_zh)) \ |
608 | written = ucs4_to_gb2312 (ch, outptr, outend - outptr); \ |
609 | else if (set == KSC5601_set \ |
610 | && (__builtin_expect (tag == TAG_none, 1) \ |
611 | || tag == TAG_language_ko)) \ |
612 | written = ucs4_to_ksc5601 (ch, outptr, outend - outptr); \ |
613 | else \ |
614 | written = __UNKNOWN_10646_CHAR; \ |
615 | \ |
616 | if (__glibc_unlikely (written == 0)) \ |
617 | { \ |
618 | result = __GCONV_FULL_OUTPUT; \ |
619 | break; \ |
620 | } \ |
621 | else if (written != __UNKNOWN_10646_CHAR) \ |
622 | outptr += written; \ |
623 | } \ |
624 | \ |
625 | if (written == __UNKNOWN_10646_CHAR \ |
626 | && __builtin_expect (tag == TAG_none, 1)) \ |
627 | { \ |
628 | if (set2 == ISO88591_set) \ |
629 | { \ |
630 | if (ch >= 0x80 && ch <= 0xff) \ |
631 | { \ |
632 | if (__glibc_unlikely (outptr + 3 > outend)) \ |
633 | { \ |
634 | result = __GCONV_FULL_OUTPUT; \ |
635 | break; \ |
636 | } \ |
637 | \ |
638 | *outptr++ = ESC; \ |
639 | *outptr++ = 'N'; \ |
640 | *outptr++ = ch & 0x7f; \ |
641 | written = 3; \ |
642 | } \ |
643 | } \ |
644 | else if (set2 == ISO88597_set) \ |
645 | { \ |
646 | if (__glibc_likely (ch < 0xffff)) \ |
647 | { \ |
648 | const struct gap *rp = from_idx; \ |
649 | \ |
650 | while (ch > rp->end) \ |
651 | ++rp; \ |
652 | if (ch >= rp->start) \ |
653 | { \ |
654 | unsigned char res = \ |
655 | iso88597_from_ucs4[ch - 0xa0 + rp->idx]; \ |
656 | if (res != '\0') \ |
657 | { \ |
658 | if (__glibc_unlikely (outptr + 3 > outend)) \ |
659 | { \ |
660 | result = __GCONV_FULL_OUTPUT; \ |
661 | break; \ |
662 | } \ |
663 | \ |
664 | *outptr++ = ESC; \ |
665 | *outptr++ = 'N'; \ |
666 | *outptr++ = res & 0x7f; \ |
667 | written = 3; \ |
668 | } \ |
669 | } \ |
670 | } \ |
671 | } \ |
672 | } \ |
673 | \ |
674 | if (written == __UNKNOWN_10646_CHAR) \ |
675 | { \ |
676 | /* The attempts to use the currently selected character set \ |
677 | failed, either because the language tag changed, or because \ |
678 | the character requires a different character set, or because \ |
679 | the character is unknown. \ |
680 | The CJK character sets partially overlap when seen as subsets \ |
681 | of ISO 10646; therefore there is no single correct result. \ |
682 | We use a preferrence order which depends on the language tag. */ \ |
683 | \ |
684 | if (ch <= 0x7f) \ |
685 | { \ |
686 | /* We must encode using ASCII. First write out the \ |
687 | escape sequence. */ \ |
688 | if (__glibc_unlikely (outptr + 3 > outend)) \ |
689 | { \ |
690 | result = __GCONV_FULL_OUTPUT; \ |
691 | break; \ |
692 | } \ |
693 | \ |
694 | *outptr++ = ESC; \ |
695 | *outptr++ = '('; \ |
696 | *outptr++ = 'B'; \ |
697 | set = ASCII_set; \ |
698 | \ |
699 | if (__glibc_unlikely (outptr + 1 > outend)) \ |
700 | { \ |
701 | result = __GCONV_FULL_OUTPUT; \ |
702 | break; \ |
703 | } \ |
704 | *outptr++ = ch; \ |
705 | \ |
706 | /* At the beginning of a line, G2 designation is cleared. */ \ |
707 | if (var == iso2022jp2 && ch == 0x0a) \ |
708 | set2 = UNSPECIFIED_set; \ |
709 | } \ |
710 | else \ |
711 | { \ |
712 | /* Now it becomes difficult. We must search the other \ |
713 | character sets one by one. Use an ordered conversion \ |
714 | list that depends on the current language tag. */ \ |
715 | cvlist_t conversion_list; \ |
716 | unsigned char buf[2]; \ |
717 | int res = __GCONV_ILLEGAL_INPUT; \ |
718 | \ |
719 | if (var == iso2022jp2) \ |
720 | conversion_list = conversion_lists[tag >> 8]; \ |
721 | else \ |
722 | conversion_list = CVLIST (japanese, 0, 0, 0, 0); \ |
723 | \ |
724 | do \ |
725 | switch (CVLIST_FIRST (conversion_list)) \ |
726 | { \ |
727 | case european: \ |
728 | \ |
729 | /* Try ISO 8859-1 upper half. */ \ |
730 | if (ch >= 0x80 && ch <= 0xff) \ |
731 | { \ |
732 | if (set2 != ISO88591_set) \ |
733 | { \ |
734 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
735 | { \ |
736 | res = __GCONV_FULL_OUTPUT; \ |
737 | break; \ |
738 | } \ |
739 | *outptr++ = ESC; \ |
740 | *outptr++ = '.'; \ |
741 | *outptr++ = 'A'; \ |
742 | set2 = ISO88591_set; \ |
743 | } \ |
744 | \ |
745 | if (__glibc_unlikely (outptr + 3 > outend)) \ |
746 | { \ |
747 | res = __GCONV_FULL_OUTPUT; \ |
748 | break; \ |
749 | } \ |
750 | *outptr++ = ESC; \ |
751 | *outptr++ = 'N'; \ |
752 | *outptr++ = ch - 0x80; \ |
753 | res = __GCONV_OK; \ |
754 | break; \ |
755 | } \ |
756 | \ |
757 | /* Try ISO 8859-7 upper half. */ \ |
758 | if (__glibc_likely (ch < 0xffff)) \ |
759 | { \ |
760 | const struct gap *rp = from_idx; \ |
761 | \ |
762 | while (ch > rp->end) \ |
763 | ++rp; \ |
764 | if (ch >= rp->start) \ |
765 | { \ |
766 | unsigned char ch2 = \ |
767 | iso88597_from_ucs4[ch - 0xa0 + rp->idx]; \ |
768 | if (ch2 != '\0') \ |
769 | { \ |
770 | if (set2 != ISO88597_set) \ |
771 | { \ |
772 | if (__builtin_expect (outptr + 3 > outend, \ |
773 | 0)) \ |
774 | { \ |
775 | res = __GCONV_FULL_OUTPUT; \ |
776 | break; \ |
777 | } \ |
778 | *outptr++ = ESC; \ |
779 | *outptr++ = '.'; \ |
780 | *outptr++ = 'F'; \ |
781 | set2 = ISO88597_set; \ |
782 | } \ |
783 | \ |
784 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
785 | { \ |
786 | res = __GCONV_FULL_OUTPUT; \ |
787 | break; \ |
788 | } \ |
789 | *outptr++ = ESC; \ |
790 | *outptr++ = 'N'; \ |
791 | *outptr++ = ch2 - 0x80; \ |
792 | res = __GCONV_OK; \ |
793 | break; \ |
794 | } \ |
795 | } \ |
796 | } \ |
797 | \ |
798 | break; \ |
799 | \ |
800 | case japanese: \ |
801 | \ |
802 | /* Try JIS X 0201 Roman. */ \ |
803 | written = ucs4_to_jisx0201 (ch, buf); \ |
804 | if (written != __UNKNOWN_10646_CHAR \ |
805 | && buf[0] > 0x20 && buf[0] < 0x80) \ |
806 | { \ |
807 | if (set != JISX0201_Roman_set) \ |
808 | { \ |
809 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
810 | { \ |
811 | res = __GCONV_FULL_OUTPUT; \ |
812 | break; \ |
813 | } \ |
814 | *outptr++ = ESC; \ |
815 | *outptr++ = '('; \ |
816 | *outptr++ = 'J'; \ |
817 | set = JISX0201_Roman_set; \ |
818 | } \ |
819 | \ |
820 | if (__glibc_unlikely (outptr + 1 > outend)) \ |
821 | { \ |
822 | res = __GCONV_FULL_OUTPUT; \ |
823 | break; \ |
824 | } \ |
825 | *outptr++ = buf[0]; \ |
826 | res = __GCONV_OK; \ |
827 | break; \ |
828 | } \ |
829 | \ |
830 | /* Try JIS X 0208. */ \ |
831 | written = ucs4_to_jisx0208 (ch, buf, 2); \ |
832 | if (written != __UNKNOWN_10646_CHAR) \ |
833 | { \ |
834 | if (set != JISX0208_1983_set) \ |
835 | { \ |
836 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
837 | { \ |
838 | res = __GCONV_FULL_OUTPUT; \ |
839 | break; \ |
840 | } \ |
841 | *outptr++ = ESC; \ |
842 | *outptr++ = '$'; \ |
843 | *outptr++ = 'B'; \ |
844 | set = JISX0208_1983_set; \ |
845 | } \ |
846 | \ |
847 | if (__glibc_unlikely (outptr + 2 > outend)) \ |
848 | { \ |
849 | res = __GCONV_FULL_OUTPUT; \ |
850 | break; \ |
851 | } \ |
852 | *outptr++ = buf[0]; \ |
853 | *outptr++ = buf[1]; \ |
854 | res = __GCONV_OK; \ |
855 | break; \ |
856 | } \ |
857 | \ |
858 | if (__glibc_unlikely (var == iso2022jp)) \ |
859 | /* Don't use the other Japanese character sets. */ \ |
860 | break; \ |
861 | \ |
862 | /* Try JIS X 0212. */ \ |
863 | written = ucs4_to_jisx0212 (ch, buf, 2); \ |
864 | if (written != __UNKNOWN_10646_CHAR) \ |
865 | { \ |
866 | if (set != JISX0212_set) \ |
867 | { \ |
868 | if (__builtin_expect (outptr + 4 > outend, 0)) \ |
869 | { \ |
870 | res = __GCONV_FULL_OUTPUT; \ |
871 | break; \ |
872 | } \ |
873 | *outptr++ = ESC; \ |
874 | *outptr++ = '$'; \ |
875 | *outptr++ = '('; \ |
876 | *outptr++ = 'D'; \ |
877 | set = JISX0212_set; \ |
878 | } \ |
879 | \ |
880 | if (__glibc_unlikely (outptr + 2 > outend)) \ |
881 | { \ |
882 | res = __GCONV_FULL_OUTPUT; \ |
883 | break; \ |
884 | } \ |
885 | *outptr++ = buf[0]; \ |
886 | *outptr++ = buf[1]; \ |
887 | res = __GCONV_OK; \ |
888 | break; \ |
889 | } \ |
890 | \ |
891 | break; \ |
892 | \ |
893 | case chinese: \ |
894 | assert (var == iso2022jp2); \ |
895 | \ |
896 | /* Try GB 2312. */ \ |
897 | written = ucs4_to_gb2312 (ch, buf, 2); \ |
898 | if (written != __UNKNOWN_10646_CHAR) \ |
899 | { \ |
900 | if (set != GB2312_set) \ |
901 | { \ |
902 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
903 | { \ |
904 | res = __GCONV_FULL_OUTPUT; \ |
905 | break; \ |
906 | } \ |
907 | *outptr++ = ESC; \ |
908 | *outptr++ = '$'; \ |
909 | *outptr++ = 'A'; \ |
910 | set = GB2312_set; \ |
911 | } \ |
912 | \ |
913 | if (__glibc_unlikely (outptr + 2 > outend)) \ |
914 | { \ |
915 | res = __GCONV_FULL_OUTPUT; \ |
916 | break; \ |
917 | } \ |
918 | *outptr++ = buf[0]; \ |
919 | *outptr++ = buf[1]; \ |
920 | res = __GCONV_OK; \ |
921 | break; \ |
922 | } \ |
923 | \ |
924 | break; \ |
925 | \ |
926 | case korean: \ |
927 | assert (var == iso2022jp2); \ |
928 | \ |
929 | /* Try KSC 5601. */ \ |
930 | written = ucs4_to_ksc5601 (ch, buf, 2); \ |
931 | if (written != __UNKNOWN_10646_CHAR) \ |
932 | { \ |
933 | if (set != KSC5601_set) \ |
934 | { \ |
935 | if (__builtin_expect (outptr + 4 > outend, 0)) \ |
936 | { \ |
937 | res = __GCONV_FULL_OUTPUT; \ |
938 | break; \ |
939 | } \ |
940 | *outptr++ = ESC; \ |
941 | *outptr++ = '$'; \ |
942 | *outptr++ = '('; \ |
943 | *outptr++ = 'C'; \ |
944 | set = KSC5601_set; \ |
945 | } \ |
946 | \ |
947 | if (__glibc_unlikely (outptr + 2 > outend)) \ |
948 | { \ |
949 | res = __GCONV_FULL_OUTPUT; \ |
950 | break; \ |
951 | } \ |
952 | *outptr++ = buf[0]; \ |
953 | *outptr++ = buf[1]; \ |
954 | res = __GCONV_OK; \ |
955 | break; \ |
956 | } \ |
957 | \ |
958 | break; \ |
959 | \ |
960 | case other: \ |
961 | assert (var == iso2022jp2); \ |
962 | \ |
963 | /* Try JIS X 0201 Kana. This is not officially part \ |
964 | of ISO-2022-JP-2, according to RFC 1554. Therefore \ |
965 | we try this only after all other attempts. */ \ |
966 | written = ucs4_to_jisx0201 (ch, buf); \ |
967 | if (written != __UNKNOWN_10646_CHAR && buf[0] >= 0x80) \ |
968 | { \ |
969 | if (set != JISX0201_Kana_set) \ |
970 | { \ |
971 | if (__builtin_expect (outptr + 3 > outend, 0)) \ |
972 | { \ |
973 | res = __GCONV_FULL_OUTPUT; \ |
974 | break; \ |
975 | } \ |
976 | *outptr++ = ESC; \ |
977 | *outptr++ = '('; \ |
978 | *outptr++ = 'I'; \ |
979 | set = JISX0201_Kana_set; \ |
980 | } \ |
981 | \ |
982 | if (__glibc_unlikely (outptr + 1 > outend)) \ |
983 | { \ |
984 | res = __GCONV_FULL_OUTPUT; \ |
985 | break; \ |
986 | } \ |
987 | *outptr++ = buf[0] - 0x80; \ |
988 | res = __GCONV_OK; \ |
989 | break; \ |
990 | } \ |
991 | \ |
992 | break; \ |
993 | \ |
994 | default: \ |
995 | abort (); \ |
996 | } \ |
997 | while (res == __GCONV_ILLEGAL_INPUT \ |
998 | && (conversion_list = CVLIST_REST (conversion_list)) != 0);\ |
999 | \ |
1000 | if (res == __GCONV_FULL_OUTPUT) \ |
1001 | { \ |
1002 | result = res; \ |
1003 | break; \ |
1004 | } \ |
1005 | \ |
1006 | if (res == __GCONV_ILLEGAL_INPUT) \ |
1007 | { \ |
1008 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ |
1009 | } \ |
1010 | } \ |
1011 | } \ |
1012 | \ |
1013 | /* Now that we wrote the output increment the input pointer. */ \ |
1014 | inptr += 4; \ |
1015 | } |
1016 | #define LOOP_NEED_FLAGS |
1017 | #define , enum variant var, int *setp |
1018 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
1019 | int set2 = *setp & CURRENT_ASSIGN_MASK; \ |
1020 | int tag = *setp & CURRENT_TAG_MASK; |
1021 | #define REINIT_PARAMS do \ |
1022 | { \ |
1023 | set = *setp & CURRENT_SEL_MASK; \ |
1024 | set2 = *setp & CURRENT_ASSIGN_MASK; \ |
1025 | tag = *setp & CURRENT_TAG_MASK; \ |
1026 | } \ |
1027 | while (0) |
1028 | #define UPDATE_PARAMS *setp = set | set2 | tag |
1029 | #include <iconv/loop.c> |
1030 | |
1031 | |
1032 | /* Now define the toplevel functions. */ |
1033 | #include <iconv/skeleton.c> |
1034 | |