1 | /* Byte-wise substring search, using the Two-Way algorithm. |
2 | Copyright (C) 2008-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | /* Before including this file, you need to include <string.h> (and |
21 | <config.h> before that, if not part of libc), and define: |
22 | RETURN_TYPE A macro that expands to the return type. |
23 | AVAILABLE(h, h_l, j, n_l) |
24 | A macro that returns nonzero if there are |
25 | at least N_L bytes left starting at H[J]. |
26 | H is 'unsigned char *', H_L, J, and N_L |
27 | are 'size_t'; H_L is an lvalue. For |
28 | NUL-terminated searches, H_L can be |
29 | modified each iteration to avoid having |
30 | to compute the end of H up front. |
31 | |
32 | For case-insensitivity, you may optionally define: |
33 | CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L |
34 | characters of P1 and P2 are equal. |
35 | CANON_ELEMENT(c) A macro that canonicalizes an element right after |
36 | it has been fetched from one of the two strings. |
37 | The argument is an 'unsigned char'; the result |
38 | must be an 'unsigned char' as well. |
39 | |
40 | Other macros you may optionally define: |
41 | RET0_IF_0(a) Documented below at default definition. |
42 | CHECK_EOL Same. |
43 | |
44 | This file undefines the macros listed above, and defines |
45 | LONG_NEEDLE_THRESHOLD. |
46 | */ |
47 | |
48 | #include <limits.h> |
49 | #include <stdint.h> |
50 | #include <sys/param.h> /* Defines MAX. */ |
51 | |
52 | /* We use the Two-Way string matching algorithm, which guarantees |
53 | linear complexity with constant space. Additionally, for long |
54 | needles, we also use a bad character shift table similar to the |
55 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) |
56 | performance. |
57 | |
58 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 |
59 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
60 | */ |
61 | |
62 | /* Point at which computing a bad-byte shift table is likely to be |
63 | worthwhile. Small needles should not compute a table, since it |
64 | adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a |
65 | speedup no greater than a factor of NEEDLE_LEN. The larger the |
66 | needle, the better the potential performance gain. On the other |
67 | hand, on non-POSIX systems with CHAR_BIT larger than eight, the |
68 | memory required for the table is prohibitive. */ |
69 | #if CHAR_BIT < 10 |
70 | # define LONG_NEEDLE_THRESHOLD 32U |
71 | #else |
72 | # define LONG_NEEDLE_THRESHOLD SIZE_MAX |
73 | #endif |
74 | |
75 | #ifndef CANON_ELEMENT |
76 | # define CANON_ELEMENT(c) c |
77 | #endif |
78 | #ifndef CMP_FUNC |
79 | # define CMP_FUNC memcmp |
80 | #endif |
81 | |
82 | /* Check for end-of-line in strstr and strcasestr routines. |
83 | We piggy-back matching procedure for detecting EOL where possible, |
84 | and use AVAILABLE macro otherwise. */ |
85 | #ifndef CHECK_EOL |
86 | # define CHECK_EOL (0) |
87 | #endif |
88 | |
89 | /* Return NULL if argument is '\0'. */ |
90 | #ifndef RET0_IF_0 |
91 | # define RET0_IF_0(a) /* nothing */ |
92 | #endif |
93 | |
94 | /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. |
95 | Return the index of the first byte in the right half, and set |
96 | *PERIOD to the global period of the right half. |
97 | |
98 | The global period of a string is the smallest index (possibly its |
99 | length) at which all remaining bytes in the string are repetitions |
100 | of the prefix (the last repetition may be a subset of the prefix). |
101 | |
102 | When NEEDLE is factored into two halves, a local period is the |
103 | length of the smallest word that shares a suffix with the left half |
104 | and shares a prefix with the right half. All factorizations of a |
105 | non-empty NEEDLE have a local period of at least 1 and no greater |
106 | than NEEDLE_LEN. |
107 | |
108 | A critical factorization has the property that the local period |
109 | equals the global period. All strings have at least one critical |
110 | factorization with the left half smaller than the global period. |
111 | |
112 | Given an ordered alphabet, a critical factorization can be computed |
113 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
114 | larger of two ordered maximal suffixes. The ordered maximal |
115 | suffixes are determined by lexicographic comparison of |
116 | periodicity. */ |
117 | static size_t |
118 | critical_factorization (const unsigned char *needle, size_t needle_len, |
119 | size_t *period) |
120 | { |
121 | /* Index of last byte of left half, or SIZE_MAX. */ |
122 | size_t max_suffix, max_suffix_rev; |
123 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
124 | size_t k; /* Offset into current period. */ |
125 | size_t p; /* Intermediate period. */ |
126 | unsigned char a, b; /* Current comparison bytes. */ |
127 | |
128 | /* Invariants: |
129 | 0 <= j < NEEDLE_LEN - 1 |
130 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
131 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
132 | 1 <= p <= global period of NEEDLE |
133 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
134 | 1 <= k <= p |
135 | */ |
136 | |
137 | /* Perform lexicographic search. */ |
138 | max_suffix = SIZE_MAX; |
139 | j = 0; |
140 | k = p = 1; |
141 | while (j + k < needle_len) |
142 | { |
143 | a = CANON_ELEMENT (needle[j + k]); |
144 | b = CANON_ELEMENT (needle[max_suffix + k]); |
145 | if (a < b) |
146 | { |
147 | /* Suffix is smaller, period is entire prefix so far. */ |
148 | j += k; |
149 | k = 1; |
150 | p = j - max_suffix; |
151 | } |
152 | else if (a == b) |
153 | { |
154 | /* Advance through repetition of the current period. */ |
155 | if (k != p) |
156 | ++k; |
157 | else |
158 | { |
159 | j += p; |
160 | k = 1; |
161 | } |
162 | } |
163 | else /* b < a */ |
164 | { |
165 | /* Suffix is larger, start over from current location. */ |
166 | max_suffix = j++; |
167 | k = p = 1; |
168 | } |
169 | } |
170 | *period = p; |
171 | |
172 | /* Perform reverse lexicographic search. */ |
173 | max_suffix_rev = SIZE_MAX; |
174 | j = 0; |
175 | k = p = 1; |
176 | while (j + k < needle_len) |
177 | { |
178 | a = CANON_ELEMENT (needle[j + k]); |
179 | b = CANON_ELEMENT (needle[max_suffix_rev + k]); |
180 | if (b < a) |
181 | { |
182 | /* Suffix is smaller, period is entire prefix so far. */ |
183 | j += k; |
184 | k = 1; |
185 | p = j - max_suffix_rev; |
186 | } |
187 | else if (a == b) |
188 | { |
189 | /* Advance through repetition of the current period. */ |
190 | if (k != p) |
191 | ++k; |
192 | else |
193 | { |
194 | j += p; |
195 | k = 1; |
196 | } |
197 | } |
198 | else /* a < b */ |
199 | { |
200 | /* Suffix is larger, start over from current location. */ |
201 | max_suffix_rev = j++; |
202 | k = p = 1; |
203 | } |
204 | } |
205 | |
206 | /* Choose the longer suffix. Return the first byte of the right |
207 | half, rather than the last byte of the left half. */ |
208 | if (max_suffix_rev + 1 < max_suffix + 1) |
209 | return max_suffix + 1; |
210 | *period = p; |
211 | return max_suffix_rev + 1; |
212 | } |
213 | |
214 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
215 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
216 | method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. |
217 | Performance is guaranteed to be linear, with an initialization cost |
218 | of 2 * NEEDLE_LEN comparisons. |
219 | |
220 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
221 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. |
222 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
223 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ |
224 | static inline RETURN_TYPE |
225 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
226 | const unsigned char *needle, size_t needle_len) |
227 | { |
228 | size_t i; /* Index into current byte of NEEDLE. */ |
229 | size_t j; /* Index into current window of HAYSTACK. */ |
230 | size_t period; /* The period of the right half of needle. */ |
231 | size_t suffix; /* The index of the right half of needle. */ |
232 | |
233 | /* Factor the needle into two halves, such that the left half is |
234 | smaller than the global period, and the right half is |
235 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
236 | suffix = critical_factorization (needle, needle_len, &period); |
237 | |
238 | /* Perform the search. Each iteration compares the right half |
239 | first. */ |
240 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
241 | { |
242 | /* Entire needle is periodic; a mismatch can only advance by the |
243 | period, so use memory to avoid rescanning known occurrences |
244 | of the period. */ |
245 | size_t memory = 0; |
246 | j = 0; |
247 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
248 | { |
249 | const unsigned char *pneedle; |
250 | const unsigned char *phaystack; |
251 | |
252 | /* Scan for matches in right half. */ |
253 | i = MAX (suffix, memory); |
254 | pneedle = &needle[i]; |
255 | phaystack = &haystack[i + j]; |
256 | while (i < needle_len && (CANON_ELEMENT (*pneedle++) |
257 | == CANON_ELEMENT (*phaystack++))) |
258 | ++i; |
259 | if (needle_len <= i) |
260 | { |
261 | /* Scan for matches in left half. */ |
262 | i = suffix - 1; |
263 | pneedle = &needle[i]; |
264 | phaystack = &haystack[i + j]; |
265 | while (memory < i + 1 && (CANON_ELEMENT (*pneedle--) |
266 | == CANON_ELEMENT (*phaystack--))) |
267 | --i; |
268 | if (i + 1 < memory + 1) |
269 | return (RETURN_TYPE) (haystack + j); |
270 | /* No match, so remember how many repetitions of period |
271 | on the right half were scanned. */ |
272 | j += period; |
273 | memory = needle_len - period; |
274 | } |
275 | else |
276 | { |
277 | j += i - suffix + 1; |
278 | memory = 0; |
279 | } |
280 | } |
281 | } |
282 | else |
283 | { |
284 | const unsigned char *phaystack; |
285 | /* The comparison always starts from needle[suffix], so cache it |
286 | and use an optimized first-character loop. */ |
287 | unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); |
288 | |
289 | /* The two halves of needle are distinct; no extra memory is |
290 | required, and any mismatch results in a maximal shift. */ |
291 | period = MAX (suffix, needle_len - suffix) + 1; |
292 | j = 0; |
293 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
294 | { |
295 | unsigned char haystack_char; |
296 | const unsigned char *pneedle; |
297 | |
298 | phaystack = &haystack[suffix + j]; |
299 | |
300 | #ifdef FASTSEARCH |
301 | if (*phaystack++ != needle_suffix) |
302 | { |
303 | phaystack = FASTSEARCH (phaystack, needle_suffix, |
304 | haystack_len - needle_len - j); |
305 | if (phaystack == NULL) |
306 | goto ret0; |
307 | j = phaystack - &haystack[suffix]; |
308 | phaystack++; |
309 | } |
310 | #else |
311 | while (needle_suffix |
312 | != (haystack_char = CANON_ELEMENT (*phaystack++))) |
313 | { |
314 | RET0_IF_0 (haystack_char); |
315 | # if !CHECK_EOL |
316 | ++j; |
317 | if (!AVAILABLE (haystack, haystack_len, j, needle_len)) |
318 | goto ret0; |
319 | # endif |
320 | } |
321 | |
322 | # if CHECK_EOL |
323 | /* Calculate J if it wasn't kept up-to-date in the first-character |
324 | loop. */ |
325 | j = phaystack - &haystack[suffix] - 1; |
326 | # endif |
327 | #endif |
328 | /* Scan for matches in right half. */ |
329 | i = suffix + 1; |
330 | pneedle = &needle[i]; |
331 | while (i < needle_len) |
332 | { |
333 | if (CANON_ELEMENT (*pneedle++) |
334 | != (haystack_char = CANON_ELEMENT (*phaystack++))) |
335 | { |
336 | RET0_IF_0 (haystack_char); |
337 | break; |
338 | } |
339 | ++i; |
340 | } |
341 | #if CHECK_EOL |
342 | /* Update minimal length of haystack. */ |
343 | if (phaystack > haystack + haystack_len) |
344 | haystack_len = phaystack - haystack; |
345 | #endif |
346 | if (needle_len <= i) |
347 | { |
348 | /* Scan for matches in left half. */ |
349 | i = suffix - 1; |
350 | pneedle = &needle[i]; |
351 | phaystack = &haystack[i + j]; |
352 | while (i != SIZE_MAX) |
353 | { |
354 | if (CANON_ELEMENT (*pneedle--) |
355 | != (haystack_char = CANON_ELEMENT (*phaystack--))) |
356 | { |
357 | RET0_IF_0 (haystack_char); |
358 | break; |
359 | } |
360 | --i; |
361 | } |
362 | if (i == SIZE_MAX) |
363 | return (RETURN_TYPE) (haystack + j); |
364 | j += period; |
365 | } |
366 | else |
367 | j += i - suffix + 1; |
368 | } |
369 | } |
370 | ret0: __attribute__ ((unused)) |
371 | return NULL; |
372 | } |
373 | |
374 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
375 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
376 | method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. |
377 | Performance is guaranteed to be linear, with an initialization cost |
378 | of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. |
379 | |
380 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
381 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, |
382 | and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. |
383 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
384 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and |
385 | sublinear performance is not possible. |
386 | |
387 | Since this function is large and complex, block inlining to avoid |
388 | slowing down the common case of small needles. */ |
389 | __attribute__((noinline)) static RETURN_TYPE |
390 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
391 | const unsigned char *needle, size_t needle_len) |
392 | { |
393 | size_t i; /* Index into current byte of NEEDLE. */ |
394 | size_t j; /* Index into current window of HAYSTACK. */ |
395 | size_t period; /* The period of the right half of needle. */ |
396 | size_t suffix; /* The index of the right half of needle. */ |
397 | size_t shift_table[1U << CHAR_BIT]; /* See below. */ |
398 | |
399 | /* Factor the needle into two halves, such that the left half is |
400 | smaller than the global period, and the right half is |
401 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
402 | suffix = critical_factorization (needle, needle_len, &period); |
403 | |
404 | /* Populate shift_table. For each possible byte value c, |
405 | shift_table[c] is the distance from the last occurrence of c to |
406 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. |
407 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
408 | for (i = 0; i < 1U << CHAR_BIT; i++) |
409 | shift_table[i] = needle_len; |
410 | for (i = 0; i < needle_len; i++) |
411 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
412 | |
413 | /* Perform the search. Each iteration compares the right half |
414 | first. */ |
415 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
416 | { |
417 | /* Entire needle is periodic; a mismatch can only advance by the |
418 | period, so use memory to avoid rescanning known occurrences |
419 | of the period. */ |
420 | size_t memory = 0; |
421 | size_t shift; |
422 | j = 0; |
423 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
424 | { |
425 | const unsigned char *pneedle; |
426 | const unsigned char *phaystack; |
427 | |
428 | /* Check the last byte first; if it does not match, then |
429 | shift to the next possible match location. */ |
430 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
431 | if (0 < shift) |
432 | { |
433 | if (memory && shift < period) |
434 | { |
435 | /* Since needle is periodic, but the last period has |
436 | a byte out of place, there can be no match until |
437 | after the mismatch. */ |
438 | shift = needle_len - period; |
439 | } |
440 | memory = 0; |
441 | j += shift; |
442 | continue; |
443 | } |
444 | /* Scan for matches in right half. The last byte has |
445 | already been matched, by virtue of the shift table. */ |
446 | i = MAX (suffix, memory); |
447 | pneedle = &needle[i]; |
448 | phaystack = &haystack[i + j]; |
449 | while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++) |
450 | == CANON_ELEMENT (*phaystack++))) |
451 | ++i; |
452 | if (needle_len - 1 <= i) |
453 | { |
454 | /* Scan for matches in left half. */ |
455 | i = suffix - 1; |
456 | pneedle = &needle[i]; |
457 | phaystack = &haystack[i + j]; |
458 | while (memory < i + 1 && (CANON_ELEMENT (*pneedle--) |
459 | == CANON_ELEMENT (*phaystack--))) |
460 | --i; |
461 | if (i + 1 < memory + 1) |
462 | return (RETURN_TYPE) (haystack + j); |
463 | /* No match, so remember how many repetitions of period |
464 | on the right half were scanned. */ |
465 | j += period; |
466 | memory = needle_len - period; |
467 | } |
468 | else |
469 | { |
470 | j += i - suffix + 1; |
471 | memory = 0; |
472 | } |
473 | } |
474 | } |
475 | else |
476 | { |
477 | /* The two halves of needle are distinct; no extra memory is |
478 | required, and any mismatch results in a maximal shift. */ |
479 | size_t shift; |
480 | period = MAX (suffix, needle_len - suffix) + 1; |
481 | j = 0; |
482 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
483 | { |
484 | const unsigned char *pneedle; |
485 | const unsigned char *phaystack; |
486 | |
487 | /* Check the last byte first; if it does not match, then |
488 | shift to the next possible match location. */ |
489 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
490 | if (0 < shift) |
491 | { |
492 | j += shift; |
493 | continue; |
494 | } |
495 | /* Scan for matches in right half. The last byte has |
496 | already been matched, by virtue of the shift table. */ |
497 | i = suffix; |
498 | pneedle = &needle[i]; |
499 | phaystack = &haystack[i + j]; |
500 | while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++) |
501 | == CANON_ELEMENT (*phaystack++))) |
502 | ++i; |
503 | if (needle_len - 1 <= i) |
504 | { |
505 | /* Scan for matches in left half. */ |
506 | i = suffix - 1; |
507 | pneedle = &needle[i]; |
508 | phaystack = &haystack[i + j]; |
509 | while (i != SIZE_MAX && (CANON_ELEMENT (*pneedle--) |
510 | == CANON_ELEMENT (*phaystack--))) |
511 | --i; |
512 | if (i == SIZE_MAX) |
513 | return (RETURN_TYPE) (haystack + j); |
514 | j += period; |
515 | } |
516 | else |
517 | j += i - suffix + 1; |
518 | } |
519 | } |
520 | return NULL; |
521 | } |
522 | |
523 | #undef AVAILABLE |
524 | #undef CANON_ELEMENT |
525 | #undef CMP_FUNC |
526 | #undef RET0_IF_0 |
527 | #undef RETURN_TYPE |
528 | #undef CHECK_EOL |
529 | |