1 | /* Copyright (C) 1995-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library; if not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | |
19 | #include <assert.h> |
20 | #include <langinfo.h> |
21 | #include <locale.h> |
22 | #include <stddef.h> |
23 | #include <stdint.h> |
24 | #include <string.h> |
25 | #include <sys/param.h> |
26 | #include <libc-diag.h> |
27 | |
28 | #ifndef STRING_TYPE |
29 | # define STRING_TYPE char |
30 | # define USTRING_TYPE unsigned char |
31 | # define STRCOLL __strcoll_l |
32 | # define STRCMP strcmp |
33 | # define WEIGHT_H "../locale/weight.h" |
34 | # define SUFFIX MB |
35 | # define L(arg) arg |
36 | #endif |
37 | |
38 | #define CONCAT(a,b) CONCAT1(a,b) |
39 | #define CONCAT1(a,b) a##b |
40 | |
41 | #include "../locale/localeinfo.h" |
42 | #include WEIGHT_H |
43 | |
44 | /* Track status while looking for sequences in a string. */ |
45 | typedef struct |
46 | { |
47 | int len; /* Length of the current sequence. */ |
48 | size_t val; /* Position of the sequence relative to the |
49 | previous non-ignored sequence. */ |
50 | size_t idxmax; /* Maximum index in sequences. */ |
51 | size_t idxcnt; /* Current count of indices. */ |
52 | size_t backw; /* Current Backward sequence index. */ |
53 | size_t backw_stop; /* Index where the backward sequences stop. */ |
54 | const USTRING_TYPE *us; /* The string. */ |
55 | unsigned char rule; /* Saved rule for the first sequence. */ |
56 | int32_t idx; /* Index to weight of the current sequence. */ |
57 | int32_t save_idx; /* Save looked up index of a forward |
58 | sequence after the last backward |
59 | sequence. */ |
60 | const USTRING_TYPE *back_us; /* Beginning of the backward sequence. */ |
61 | } coll_seq; |
62 | |
63 | /* Get next sequence. Traverse the string as required. */ |
64 | static __always_inline void |
65 | get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, |
66 | const USTRING_TYPE *weights, const int32_t *table, |
67 | const USTRING_TYPE *, const int32_t *indirect, |
68 | int pass) |
69 | { |
70 | size_t val = seq->val = 0; |
71 | int len = seq->len; |
72 | size_t backw_stop = seq->backw_stop; |
73 | size_t backw = seq->backw; |
74 | size_t idxcnt = seq->idxcnt; |
75 | size_t idxmax = seq->idxmax; |
76 | int32_t idx = seq->idx; |
77 | const USTRING_TYPE *us = seq->us; |
78 | |
79 | while (len == 0) |
80 | { |
81 | ++val; |
82 | if (backw_stop != ~0ul) |
83 | { |
84 | /* There is something pushed. */ |
85 | if (backw == backw_stop) |
86 | { |
87 | /* The last pushed character was handled. Continue |
88 | with forward characters. */ |
89 | if (idxcnt < idxmax) |
90 | { |
91 | idx = seq->save_idx; |
92 | backw_stop = ~0ul; |
93 | } |
94 | else |
95 | { |
96 | /* Nothing anymore. The backward sequence ended with |
97 | the last sequence in the string. Note that len is |
98 | still zero. */ |
99 | idx = 0; |
100 | break; |
101 | } |
102 | } |
103 | else |
104 | { |
105 | /* XXX Traverse BACKW sequences from the beginning of |
106 | BACKW_STOP to get the next sequence. Is ther a quicker way |
107 | to do this? */ |
108 | size_t i = backw_stop; |
109 | us = seq->back_us; |
110 | while (i < backw) |
111 | { |
112 | int32_t tmp = findidx (table, indirect, extra, &us, -1); |
113 | idx = tmp & 0xffffff; |
114 | i++; |
115 | } |
116 | --backw; |
117 | us = seq->us; |
118 | } |
119 | } |
120 | else |
121 | { |
122 | backw_stop = idxmax; |
123 | int32_t prev_idx = idx; |
124 | |
125 | while (*us != L('\0')) |
126 | { |
127 | int32_t tmp = findidx (table, indirect, extra, &us, -1); |
128 | unsigned char rule = tmp >> 24; |
129 | prev_idx = idx; |
130 | idx = tmp & 0xffffff; |
131 | idxcnt = idxmax++; |
132 | |
133 | /* Save the rule for the first sequence. */ |
134 | if (__glibc_unlikely (idxcnt == 0)) |
135 | seq->rule = rule; |
136 | |
137 | if ((rulesets[rule * nrules + pass] |
138 | & sort_backward) == 0) |
139 | /* No more backward characters to push. */ |
140 | break; |
141 | ++idxcnt; |
142 | } |
143 | |
144 | if (backw_stop >= idxcnt) |
145 | { |
146 | /* No sequence at all or just one. */ |
147 | if (idxcnt == idxmax || backw_stop > idxcnt) |
148 | /* Note that len is still zero. */ |
149 | break; |
150 | |
151 | backw_stop = ~0ul; |
152 | } |
153 | else |
154 | { |
155 | /* We pushed backward sequences. If the stream ended with the |
156 | backward sequence, then we process the last sequence we |
157 | found. Otherwise we process the sequence before the last |
158 | one since the last one was a forward sequence. */ |
159 | seq->back_us = seq->us; |
160 | seq->us = us; |
161 | backw = idxcnt; |
162 | if (idxmax > idxcnt) |
163 | { |
164 | backw--; |
165 | seq->save_idx = idx; |
166 | idx = prev_idx; |
167 | } |
168 | if (backw > backw_stop) |
169 | backw--; |
170 | } |
171 | } |
172 | |
173 | /* With GCC 5.3 when compiling with -Os the compiler complains |
174 | that idx, taken from seq->idx (seq1 or seq2 from STRCOLL) may |
175 | be used uninitialized. In general this can't possibly be true |
176 | since seq1.idx and seq2.idx are initialized to zero in the |
177 | outer function. Only one case where seq->idx is restored from |
178 | seq->save_idx might result in an uninitialized idx value, but |
179 | it is guarded by a sequence of checks against backw_stop which |
180 | ensures that seq->save_idx was saved to first and contains a |
181 | valid value. */ |
182 | DIAG_PUSH_NEEDS_COMMENT; |
183 | DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized" ); |
184 | len = weights[idx++]; |
185 | DIAG_POP_NEEDS_COMMENT; |
186 | /* Skip over indices of previous levels. */ |
187 | for (int i = 0; i < pass; i++) |
188 | { |
189 | idx += len; |
190 | len = weights[idx]; |
191 | idx++; |
192 | } |
193 | } |
194 | |
195 | /* Update the structure. */ |
196 | seq->val = val; |
197 | seq->len = len; |
198 | seq->backw_stop = backw_stop; |
199 | seq->backw = backw; |
200 | seq->idxcnt = idxcnt; |
201 | seq->idxmax = idxmax; |
202 | seq->us = us; |
203 | seq->idx = idx; |
204 | } |
205 | |
206 | /* Compare two sequences. */ |
207 | static __always_inline int |
208 | do_compare (coll_seq *seq1, coll_seq *seq2, int position, |
209 | const USTRING_TYPE *weights) |
210 | { |
211 | int seq1len = seq1->len; |
212 | int seq2len = seq2->len; |
213 | size_t val1 = seq1->val; |
214 | size_t val2 = seq2->val; |
215 | int idx1 = seq1->idx; |
216 | int idx2 = seq2->idx; |
217 | int result = 0; |
218 | |
219 | /* Test for position if necessary. */ |
220 | if (position && val1 != val2) |
221 | { |
222 | result = val1 > val2 ? 1 : -1; |
223 | goto out; |
224 | } |
225 | |
226 | /* Compare the two sequences. */ |
227 | do |
228 | { |
229 | if (weights[idx1] != weights[idx2]) |
230 | { |
231 | /* The sequences differ. */ |
232 | result = weights[idx1] - weights[idx2]; |
233 | goto out; |
234 | } |
235 | |
236 | /* Increment the offsets. */ |
237 | ++idx1; |
238 | ++idx2; |
239 | |
240 | --seq1len; |
241 | --seq2len; |
242 | } |
243 | while (seq1len > 0 && seq2len > 0); |
244 | |
245 | if (position && seq1len != seq2len) |
246 | result = seq1len - seq2len; |
247 | |
248 | out: |
249 | seq1->len = seq1len; |
250 | seq2->len = seq2len; |
251 | seq1->idx = idx1; |
252 | seq2->idx = idx2; |
253 | return result; |
254 | } |
255 | |
256 | int |
257 | STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, locale_t l) |
258 | { |
259 | struct __locale_data *current = l->__locales[LC_COLLATE]; |
260 | uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word; |
261 | /* We don't assign the following values right away since it might be |
262 | unnecessary in case there are no rules. */ |
263 | const unsigned char *rulesets; |
264 | const int32_t *table; |
265 | const USTRING_TYPE *weights; |
266 | const USTRING_TYPE *; |
267 | const int32_t *indirect; |
268 | |
269 | if (nrules == 0) |
270 | return STRCMP (s1, s2); |
271 | |
272 | /* Catch empty strings. */ |
273 | if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) |
274 | return (*s1 != '\0') - (*s2 != '\0'); |
275 | |
276 | rulesets = (const unsigned char *) |
277 | current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; |
278 | table = (const int32_t *) |
279 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string; |
280 | weights = (const USTRING_TYPE *) |
281 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string; |
282 | extra = (const USTRING_TYPE *) |
283 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string; |
284 | indirect = (const int32_t *) |
285 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string; |
286 | |
287 | assert (((uintptr_t) table) % __alignof__ (table[0]) == 0); |
288 | assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0); |
289 | assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0); |
290 | assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0); |
291 | |
292 | int result = 0, rule = 0; |
293 | |
294 | /* With GCC 7 when compiling with -Os the compiler warns that |
295 | seq1.back_us and seq2.back_us might be used uninitialized. |
296 | Sometimes this warning appears at locations in locale/weightwc.h |
297 | where the actual use is, but on architectures other than x86_64, |
298 | x86 and s390x, a warning appears at the definitions of seq1 and |
299 | seq2. This uninitialized use is impossible for the same reason |
300 | as described in comments in locale/weightwc.h. */ |
301 | DIAG_PUSH_NEEDS_COMMENT; |
302 | DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized" ); |
303 | coll_seq seq1, seq2; |
304 | DIAG_POP_NEEDS_COMMENT; |
305 | seq1.len = 0; |
306 | seq1.idxmax = 0; |
307 | seq1.rule = 0; |
308 | seq2.len = 0; |
309 | seq2.idxmax = 0; |
310 | |
311 | for (int pass = 0; pass < nrules; ++pass) |
312 | { |
313 | seq1.idxcnt = 0; |
314 | seq1.idx = 0; |
315 | seq2.idx = 0; |
316 | seq1.backw_stop = ~0ul; |
317 | seq1.backw = ~0ul; |
318 | seq2.idxcnt = 0; |
319 | seq2.backw_stop = ~0ul; |
320 | seq2.backw = ~0ul; |
321 | |
322 | /* We need the elements of the strings as unsigned values since they |
323 | are used as indices. */ |
324 | seq1.us = (const USTRING_TYPE *) s1; |
325 | seq2.us = (const USTRING_TYPE *) s2; |
326 | |
327 | /* We assume that if a rule has defined `position' in one section |
328 | this is true for all of them. Please note that the localedef programs |
329 | makes sure that `position' is not used at the first level. */ |
330 | |
331 | int position = rulesets[rule * nrules + pass] & sort_position; |
332 | |
333 | while (1) |
334 | { |
335 | get_next_seq (&seq1, nrules, rulesets, weights, table, |
336 | extra, indirect, pass); |
337 | get_next_seq (&seq2, nrules, rulesets, weights, table, |
338 | extra, indirect, pass); |
339 | /* See whether any or both strings are empty. */ |
340 | if (seq1.len == 0 || seq2.len == 0) |
341 | { |
342 | if (seq1.len == seq2.len) |
343 | { |
344 | /* Both strings ended and are equal at this level. Do a |
345 | byte-level comparison to ensure that we don't waste time |
346 | going through multiple passes for totally equal strings |
347 | before proceeding to subsequent passes. */ |
348 | if (pass == 0 && STRCMP (s1, s2) == 0) |
349 | return result; |
350 | else |
351 | break; |
352 | } |
353 | |
354 | /* This means one string is shorter than the other. Find out |
355 | which one and return an appropriate value. */ |
356 | return seq1.len == 0 ? -1 : 1; |
357 | } |
358 | |
359 | result = do_compare (&seq1, &seq2, position, weights); |
360 | if (result != 0) |
361 | return result; |
362 | } |
363 | |
364 | rule = seq1.rule; |
365 | } |
366 | |
367 | return result; |
368 | } |
369 | libc_hidden_def (STRCOLL) |
370 | |
371 | #ifndef WIDE_CHAR_VERSION |
372 | weak_alias (__strcoll_l, strcoll_l) |
373 | #endif |
374 | |