/* Conversion module for Unicode
   Copyright (C) 1999-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18 | |
19 | #include <byteswap.h> |
20 | #include <dlfcn.h> |
21 | #include <gconv.h> |
22 | #include <stddef.h> |
23 | #include <stdint.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | |
/* The Byte Order Mark (BOM) as it reads in the expected byte order.  */
#define BOM	0xFEFF
/* The same mark as it reads when the two bytes are swapped.  */
#define BOM_OE	0xFFFE
31 | |
32 | |
/* Definitions used in the body of the `gconv' function.

   FROM_LOOP / TO_LOOP name the two conversion loop functions defined
   further down.  The external side works on two-byte units
   (MIN_NEEDED_FROM), the internal side on four-byte units
   (MIN_NEEDED_TO).  The direction of a step is read at run time from
   the step's private data, hence ONE_DIRECTION is 0.  DEFINE_INIT and
   DEFINE_FINI are 0; this file supplies gconv_init and gconv_end
   itself.  */
#define FROM_LOOP		from_unicode_loop
#define TO_LOOP			to_unicode_loop
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define MIN_NEEDED_FROM		2
#define MIN_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_unicode)

/* Per-call setup.  The expansion site must have `step', `data', `inptr',
   `inend', `inptrp', `outbuf' and `outend' in scope.

   Converting from Unicode, first invocation only: look at the first two
   input bytes.  A BOM is skipped; a byte-swapped BOM is skipped as well
   and additionally sets __GCONV_SWAP in data->__flags.  With no input at
   all the macro returns __GCONV_EMPTY_INPUT, with a single byte it
   returns __GCONV_INCOMPLETE_INPUT.

   Converting to Unicode, first invocation only and not for internal
   use: write a BOM to the output, or return __GCONV_FULL_OUTPUT when
   there is no room for it.

   In every case `swap' ends up holding the __GCONV_SWAP bit of
   data->__flags.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct unicode_data *) step->__data)->dir; \
  int swap; \
  if (FROM_DIRECTION) \
    { \
      if (data->__invocation_counter == 0) \
	{ \
	  /* We have to find out which byte order the file is encoded in.  */ \
	  if (inptr + 2 > inend) \
	    return (inptr == inend \
		    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); \
 \
	  if (get16u (inptr) == BOM) \
	    /* Simply ignore the BOM character.  */ \
	    *inptrp = inptr += 2; \
	  else if (get16u (inptr) == BOM_OE) \
	    { \
	      data->__flags |= __GCONV_SWAP; \
	      *inptrp = inptr += 2; \
	    } \
	} \
    } \
  else if (!data->__internal_use && data->__invocation_counter == 0) \
    { \
      /* Emit the Byte Order Mark.  */ \
      if (__glibc_unlikely (outbuf + 2 > outend)) \
	return __GCONV_FULL_OUTPUT; \
 \
      put16u (outbuf, BOM); \
      outbuf += 2; \
    } \
  swap = data->__flags & __GCONV_SWAP;
/* Extra argument handed to the conversion loops; it pairs with the
   `, int swap' extra parameter declared for the loop functions below.  */
#define EXTRA_LOOP_ARGS		, swap
75 | |
76 | |
/* Which way a conversion step translates.  */
enum direction
{
  illegal_dir = 0,	/* Placeholder: no direction chosen.  */
  to_unicode = 1,	/* Internal format to the Unicode encoding.  */
  from_unicode = 2	/* Unicode encoding to the internal format.  */
};

/* Private per-step state; gconv_init allocates one of these and hangs
   it off the step's __data pointer.  */
struct unicode_data
{
  /* Direction selected for this step.  */
  enum direction dir;
};
89 | |
90 | |
91 | extern int gconv_init (struct __gconv_step *step); |
92 | int |
93 | gconv_init (struct __gconv_step *step) |
94 | { |
95 | /* Determine which direction. */ |
96 | struct unicode_data *new_data; |
97 | enum direction dir = illegal_dir; |
98 | int result; |
99 | |
100 | if (strcmp (step->__from_name, "UNICODE//" ) == 0) |
101 | dir = from_unicode; |
102 | else |
103 | dir = to_unicode; |
104 | |
105 | new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data)); |
106 | |
107 | result = __GCONV_NOMEM; |
108 | if (new_data != NULL) |
109 | { |
110 | new_data->dir = dir; |
111 | step->__data = new_data; |
112 | |
113 | if (dir == from_unicode) |
114 | { |
115 | step->__min_needed_from = MIN_NEEDED_FROM; |
116 | step->__max_needed_from = MIN_NEEDED_FROM; |
117 | step->__min_needed_to = MIN_NEEDED_TO; |
118 | step->__max_needed_to = MIN_NEEDED_TO; |
119 | } |
120 | else |
121 | { |
122 | step->__min_needed_from = MIN_NEEDED_TO; |
123 | step->__max_needed_from = MIN_NEEDED_TO; |
124 | step->__min_needed_to = MIN_NEEDED_FROM; |
125 | step->__max_needed_to = MIN_NEEDED_FROM; |
126 | } |
127 | |
128 | step->__stateful = 0; |
129 | |
130 | result = __GCONV_OK; |
131 | } |
132 | |
133 | return result; |
134 | } |
135 | |
136 | |
/* Tear down a conversion step: release the private data that
   gconv_init attached to it.  The pointer stored in the step is not
   reset.  */
extern void gconv_end (struct __gconv_step *data);
void
gconv_end (struct __gconv_step *data)
{
  void *priv = data->__data;

  free (priv);
}
143 | |
144 | |
145 | /* Convert from the internal (UCS4-like) format to UCS2. */ |
146 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO |
147 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM |
148 | #define LOOPFCT TO_LOOP |
149 | #define BODY \ |
150 | { \ |
151 | uint32_t c = get32 (inptr); \ |
152 | \ |
153 | if (__glibc_unlikely (c >= 0x10000)) \ |
154 | { \ |
155 | UNICODE_TAG_HANDLER (c, 4); \ |
156 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ |
157 | } \ |
158 | else if (__glibc_unlikely (c >= 0xd800 && c < 0xe000)) \ |
159 | { \ |
160 | /* Surrogate characters in UCS-4 input are not valid. \ |
161 | We must catch this, because the UCS-2 output might be \ |
162 | interpreted as UTF-16 by other programs. If we let \ |
163 | surrogates pass through, attackers could make a security \ |
164 | hole exploit by synthesizing any desired plane 1-16 \ |
165 | character. */ \ |
166 | result = __GCONV_ILLEGAL_INPUT; \ |
167 | if (! ignore_errors_p ()) \ |
168 | break; \ |
169 | inptr += 4; \ |
170 | ++*irreversible; \ |
171 | continue; \ |
172 | } \ |
173 | else \ |
174 | { \ |
175 | put16 (outptr, c); \ |
176 | outptr += 2; \ |
177 | } \ |
178 | \ |
179 | inptr += 4; \ |
180 | } |
181 | #define LOOP_NEED_FLAGS |
182 | #define \ |
183 | , int swap |
184 | #include <iconv/loop.c> |
185 | |
186 | |
187 | /* Convert from UCS2 to the internal (UCS4-like) format. */ |
188 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM |
189 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO |
190 | #define LOOPFCT FROM_LOOP |
191 | #define BODY \ |
192 | { \ |
193 | uint16_t u1 = get16 (inptr); \ |
194 | \ |
195 | if (swap) \ |
196 | u1 = bswap_16 (u1); \ |
197 | \ |
198 | if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \ |
199 | { \ |
200 | /* Surrogate characters in UCS-2 input are not valid. Reject \ |
201 | them. (Catching this here is not security relevant.) */ \ |
202 | STANDARD_FROM_LOOP_ERR_HANDLER (2); \ |
203 | } \ |
204 | \ |
205 | put32 (outptr, u1); \ |
206 | \ |
207 | inptr += 2; \ |
208 | outptr += 4; \ |
209 | } |
210 | #define LOOP_NEED_FLAGS |
211 | #define \ |
212 | , int swap |
213 | #include <iconv/loop.c> |
214 | |
215 | |
216 | /* Now define the toplevel functions. */ |
217 | #include <iconv/skeleton.c> |
218 | |