1 | /* Conversion module for Unicode |
2 | Copyright (C) 1999-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <byteswap.h> |
21 | #include <dlfcn.h> |
22 | #include <gconv.h> |
23 | #include <stddef.h> |
24 | #include <stdint.h> |
25 | #include <stdlib.h> |
26 | #include <string.h> |
27 | |
/* The Byte Order Mark (BOM) as a 16-bit unit read in the expected byte
   order.  */
#define BOM     0xfeff
/* The same mark as it looks when its two bytes are read in the opposite
   ("other endian") order.  */
#define BOM_OE  0xfffe
32 | |
33 | |
/* Definitions used in the body of the `gconv' function, which is produced
   by including <iconv/skeleton.c> at the end of this file.

   The "from" side is the external UNICODE encoding (2-byte units), the
   "to" side is the internal format (4-byte units).  DEFINE_INIT and
   DEFINE_FINI are 0 because this file provides its own gconv_init and
   gconv_end.  */
#define FROM_LOOP               from_unicode_loop
#define TO_LOOP                 to_unicode_loop
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         2
#define MIN_NEEDED_TO           4
#define ONE_DIRECTION           0
#define FROM_DIRECTION          (dir == from_unicode)

/* Per-call setup code.  It reads the direction stored in step->__data and,
   only while data->__invocation_counter is still 0, deals with the Byte
   Order Mark:

   - reading UNICODE: needs at least two input bytes (otherwise returns
     __GCONV_EMPTY_INPUT or __GCONV_INCOMPLETE_INPUT); a leading BOM is
     consumed, and a byte-swapped BOM additionally sets __GCONV_SWAP in
     data->__flags;
   - writing UNICODE (and not data->__internal_use): a BOM is written to
     the output, or __GCONV_FULL_OUTPUT is returned if it does not fit.

   Finally `swap' is derived from data->__flags so that the loops can be
   told whether the input units have to be byte-swapped.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct unicode_data *) step->__data)->dir; \
  int swap; \
  if (FROM_DIRECTION) \
    { \
      if (data->__invocation_counter == 0) \
        { \
          /* We have to find out which byte order the file is encoded \
             in.  */ \
          if (inptr + 2 > inend) \
            return (inptr == inend \
                    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); \
          \
          if (get16u (inptr) == BOM) \
            /* Simply ignore the BOM character.  */ \
            *inptrp = inptr += 2; \
          else if (get16u (inptr) == BOM_OE) \
            { \
              /* BOM seen with its bytes exchanged: remember that the \
                 input must be swapped, then skip the mark.  */ \
              data->__flags |= __GCONV_SWAP; \
              *inptrp = inptr += 2; \
            } \
        } \
    } \
  else if (!data->__internal_use && data->__invocation_counter == 0) \
    { \
      /* Emit the Byte Order Mark.  */ \
      if (__glibc_unlikely (outbuf + 2 > outend)) \
        return __GCONV_FULL_OUTPUT; \
      \
      put16u (outbuf, BOM); \
      outbuf += 2; \
    } \
  swap = data->__flags & __GCONV_SWAP;

/* Extra argument appended to every call of the conversion loops; it must
   match the extra `int swap' parameter the loops declare.  */
#define EXTRA_LOOP_ARGS         , swap
76 | |
77 | |
/* Which way a conversion step runs.  illegal_dir is the "not decided yet"
   value; the other two name the side UNICODE is on.  */
enum direction
{
  illegal_dir = 0,
  to_unicode = 1,
  from_unicode = 2
};

/* Private per-step state, stored in the step's __data pointer by
   gconv_init and released by gconv_end.  */
struct unicode_data
{
  /* Direction chosen when the step was initialized.  */
  enum direction dir;
};
90 | |
91 | |
92 | extern int gconv_init (struct __gconv_step *step); |
93 | int |
94 | gconv_init (struct __gconv_step *step) |
95 | { |
96 | /* Determine which direction. */ |
97 | struct unicode_data *new_data; |
98 | enum direction dir = illegal_dir; |
99 | int result; |
100 | |
101 | if (strcmp (step->__from_name, "UNICODE//" ) == 0) |
102 | dir = from_unicode; |
103 | else |
104 | dir = to_unicode; |
105 | |
106 | new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data)); |
107 | |
108 | result = __GCONV_NOMEM; |
109 | if (new_data != NULL) |
110 | { |
111 | new_data->dir = dir; |
112 | step->__data = new_data; |
113 | |
114 | if (dir == from_unicode) |
115 | { |
116 | step->__min_needed_from = MIN_NEEDED_FROM; |
117 | step->__max_needed_from = MIN_NEEDED_FROM; |
118 | step->__min_needed_to = MIN_NEEDED_TO; |
119 | step->__max_needed_to = MIN_NEEDED_TO; |
120 | } |
121 | else |
122 | { |
123 | step->__min_needed_from = MIN_NEEDED_TO; |
124 | step->__max_needed_from = MIN_NEEDED_TO; |
125 | step->__min_needed_to = MIN_NEEDED_FROM; |
126 | step->__max_needed_to = MIN_NEEDED_FROM; |
127 | } |
128 | |
129 | step->__stateful = 0; |
130 | |
131 | result = __GCONV_OK; |
132 | } |
133 | |
134 | return result; |
135 | } |
136 | |
137 | |
138 | extern void gconv_end (struct __gconv_step *data); |
139 | void |
140 | gconv_end (struct __gconv_step *data) |
141 | { |
142 | free (data->__data); |
143 | } |
144 | |
145 | |
146 | /* Convert from the internal (UCS4-like) format to UCS2. */ |
147 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO |
148 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM |
149 | #define LOOPFCT TO_LOOP |
150 | #define BODY \ |
151 | { \ |
152 | uint32_t c = get32 (inptr); \ |
153 | \ |
154 | if (__glibc_unlikely (c >= 0x10000)) \ |
155 | { \ |
156 | UNICODE_TAG_HANDLER (c, 4); \ |
157 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ |
158 | } \ |
159 | else if (__glibc_unlikely (c >= 0xd800 && c < 0xe000)) \ |
160 | { \ |
161 | /* Surrogate characters in UCS-4 input are not valid. \ |
162 | We must catch this, because the UCS-2 output might be \ |
163 | interpreted as UTF-16 by other programs. If we let \ |
164 | surrogates pass through, attackers could make a security \ |
165 | hole exploit by synthesizing any desired plane 1-16 \ |
166 | character. */ \ |
167 | result = __GCONV_ILLEGAL_INPUT; \ |
168 | if (! ignore_errors_p ()) \ |
169 | break; \ |
170 | inptr += 4; \ |
171 | ++*irreversible; \ |
172 | continue; \ |
173 | } \ |
174 | else \ |
175 | { \ |
176 | put16 (outptr, c); \ |
177 | outptr += 2; \ |
178 | } \ |
179 | \ |
180 | inptr += 4; \ |
181 | } |
182 | #define LOOP_NEED_FLAGS |
183 | #define \ |
184 | , int swap |
185 | #include <iconv/loop.c> |
186 | |
187 | |
188 | /* Convert from UCS2 to the internal (UCS4-like) format. */ |
189 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM |
190 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO |
191 | #define LOOPFCT FROM_LOOP |
192 | #define BODY \ |
193 | { \ |
194 | uint16_t u1 = get16 (inptr); \ |
195 | \ |
196 | if (swap) \ |
197 | u1 = bswap_16 (u1); \ |
198 | \ |
199 | if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \ |
200 | { \ |
201 | /* Surrogate characters in UCS-2 input are not valid. Reject \ |
202 | them. (Catching this here is not security relevant.) */ \ |
203 | STANDARD_FROM_LOOP_ERR_HANDLER (2); \ |
204 | } \ |
205 | \ |
206 | put32 (outptr, u1); \ |
207 | \ |
208 | inptr += 2; \ |
209 | outptr += 4; \ |
210 | } |
211 | #define LOOP_NEED_FLAGS |
212 | #define \ |
213 | , int swap |
214 | #include <iconv/loop.c> |
215 | |
216 | |
217 | /* Now define the toplevel functions. */ |
218 | #include <iconv/skeleton.c> |
219 | |