1 | /* Byte-wise substring search, using the Two-Way algorithm. |
2 | Copyright (C) 2008-2018 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ |
19 | |
20 | /* Before including this file, you need to include <string.h> (and |
21 | <config.h> before that, if not part of libc), and define: |
22 | RETURN_TYPE A macro that expands to the return type. |
23 | AVAILABLE(h, h_l, j, n_l) |
24 | A macro that returns nonzero if there are |
25 | at least N_L bytes left starting at H[J]. |
26 | H is 'unsigned char *', H_L, J, and N_L |
27 | are 'size_t'; H_L is an lvalue. For |
28 | NUL-terminated searches, H_L can be |
29 | modified each iteration to avoid having |
30 | to compute the end of H up front. |
31 | |
32 | For case-insensitivity, you may optionally define: |
33 | CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L |
34 | characters of P1 and P2 are equal. |
35 | CANON_ELEMENT(c) A macro that canonicalizes an element right after |
36 | it has been fetched from one of the two strings. |
37 | The argument is an 'unsigned char'; the result |
38 | must be an 'unsigned char' as well. |
39 | |
40 | Other macros you may optionally define: |
41 | RET0_IF_0(a) Documented below at default definition. |
42 | CHECK_EOL Same. |
43 | |
44 | This file undefines the macros listed above, and defines |
45 | LONG_NEEDLE_THRESHOLD. |
46 | */ |
47 | |
48 | #include <limits.h> |
49 | #include <stdint.h> |
50 | #include <sys/param.h> /* Defines MAX. */ |
51 | |
52 | /* We use the Two-Way string matching algorithm, which guarantees |
53 | linear complexity with constant space. Additionally, for long |
54 | needles, we also use a bad character shift table similar to the |
55 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) |
56 | performance. |
57 | |
58 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 |
59 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
60 | */ |
61 | |
62 | /* Point at which computing a bad-byte shift table is likely to be |
63 | worthwhile. Small needles should not compute a table, since it |
64 | adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a |
65 | speedup no greater than a factor of NEEDLE_LEN. The larger the |
66 | needle, the better the potential performance gain. On the other |
67 | hand, on non-POSIX systems with CHAR_BIT larger than eight, the |
68 | memory required for the table is prohibitive. */ |
69 | #if CHAR_BIT < 10 |
70 | # define LONG_NEEDLE_THRESHOLD 32U |
71 | #else |
72 | # define LONG_NEEDLE_THRESHOLD SIZE_MAX |
73 | #endif |
74 | |
75 | #ifndef CANON_ELEMENT |
76 | # define CANON_ELEMENT(c) c |
77 | #endif |
78 | #ifndef CMP_FUNC |
79 | # define CMP_FUNC memcmp |
80 | #endif |
81 | |
82 | /* Check for end-of-line in strstr and strcasestr routines. |
83 | We piggy-back matching procedure for detecting EOL where possible, |
84 | and use AVAILABLE macro otherwise. */ |
85 | #ifndef CHECK_EOL |
86 | # define CHECK_EOL (0) |
87 | #endif |
88 | |
89 | /* Return NULL if argument is '\0'. */ |
90 | #ifndef RET0_IF_0 |
91 | # define RET0_IF_0(a) /* nothing */ |
92 | #endif |
93 | |
94 | /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. |
95 | Return the index of the first byte in the right half, and set |
96 | *PERIOD to the global period of the right half. |
97 | |
98 | The global period of a string is the smallest index (possibly its |
99 | length) at which all remaining bytes in the string are repetitions |
100 | of the prefix (the last repetition may be a subset of the prefix). |
101 | |
102 | When NEEDLE is factored into two halves, a local period is the |
103 | length of the smallest word that shares a suffix with the left half |
104 | and shares a prefix with the right half. All factorizations of a |
105 | non-empty NEEDLE have a local period of at least 1 and no greater |
106 | than NEEDLE_LEN. |
107 | |
108 | A critical factorization has the property that the local period |
109 | equals the global period. All strings have at least one critical |
110 | factorization with the left half smaller than the global period. |
111 | |
112 | Given an ordered alphabet, a critical factorization can be computed |
113 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
114 | larger of two ordered maximal suffixes. The ordered maximal |
115 | suffixes are determined by lexicographic comparison of |
116 | periodicity. */ |
117 | static size_t |
118 | critical_factorization (const unsigned char *needle, size_t needle_len, |
119 | size_t *period) |
120 | { |
121 | /* Index of last byte of left half, or SIZE_MAX. */ |
122 | size_t max_suffix, max_suffix_rev; |
123 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
124 | size_t k; /* Offset into current period. */ |
125 | size_t p; /* Intermediate period. */ |
126 | unsigned char a, b; /* Current comparison bytes. */ |
127 | |
128 | /* Invariants: |
129 | 0 <= j < NEEDLE_LEN - 1 |
130 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
131 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
132 | 1 <= p <= global period of NEEDLE |
133 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
134 | 1 <= k <= p |
135 | */ |
136 | |
137 | /* Perform lexicographic search. */ |
138 | max_suffix = SIZE_MAX; |
139 | j = 0; |
140 | k = p = 1; |
141 | while (j + k < needle_len) |
142 | { |
143 | a = CANON_ELEMENT (needle[j + k]); |
144 | b = CANON_ELEMENT (needle[max_suffix + k]); |
145 | if (a < b) |
146 | { |
147 | /* Suffix is smaller, period is entire prefix so far. */ |
148 | j += k; |
149 | k = 1; |
150 | p = j - max_suffix; |
151 | } |
152 | else if (a == b) |
153 | { |
154 | /* Advance through repetition of the current period. */ |
155 | if (k != p) |
156 | ++k; |
157 | else |
158 | { |
159 | j += p; |
160 | k = 1; |
161 | } |
162 | } |
163 | else /* b < a */ |
164 | { |
165 | /* Suffix is larger, start over from current location. */ |
166 | max_suffix = j++; |
167 | k = p = 1; |
168 | } |
169 | } |
170 | *period = p; |
171 | |
172 | /* Perform reverse lexicographic search. */ |
173 | max_suffix_rev = SIZE_MAX; |
174 | j = 0; |
175 | k = p = 1; |
176 | while (j + k < needle_len) |
177 | { |
178 | a = CANON_ELEMENT (needle[j + k]); |
179 | b = CANON_ELEMENT (needle[max_suffix_rev + k]); |
180 | if (b < a) |
181 | { |
182 | /* Suffix is smaller, period is entire prefix so far. */ |
183 | j += k; |
184 | k = 1; |
185 | p = j - max_suffix_rev; |
186 | } |
187 | else if (a == b) |
188 | { |
189 | /* Advance through repetition of the current period. */ |
190 | if (k != p) |
191 | ++k; |
192 | else |
193 | { |
194 | j += p; |
195 | k = 1; |
196 | } |
197 | } |
198 | else /* a < b */ |
199 | { |
200 | /* Suffix is larger, start over from current location. */ |
201 | max_suffix_rev = j++; |
202 | k = p = 1; |
203 | } |
204 | } |
205 | |
206 | /* Choose the longer suffix. Return the first byte of the right |
207 | half, rather than the last byte of the left half. */ |
208 | if (max_suffix_rev + 1 < max_suffix + 1) |
209 | return max_suffix + 1; |
210 | *period = p; |
211 | return max_suffix_rev + 1; |
212 | } |
213 | |
214 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
215 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
216 | method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. |
217 | Performance is guaranteed to be linear, with an initialization cost |
218 | of 2 * NEEDLE_LEN comparisons. |
219 | |
220 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
221 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. |
222 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
223 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ |
224 | static RETURN_TYPE |
225 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
226 | const unsigned char *needle, size_t needle_len) |
227 | { |
228 | size_t i; /* Index into current byte of NEEDLE. */ |
229 | size_t j; /* Index into current window of HAYSTACK. */ |
230 | size_t period; /* The period of the right half of needle. */ |
231 | size_t suffix; /* The index of the right half of needle. */ |
232 | |
233 | /* Factor the needle into two halves, such that the left half is |
234 | smaller than the global period, and the right half is |
235 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
236 | suffix = critical_factorization (needle, needle_len, &period); |
237 | |
238 | /* Perform the search. Each iteration compares the right half |
239 | first. */ |
240 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
241 | { |
242 | /* Entire needle is periodic; a mismatch can only advance by the |
243 | period, so use memory to avoid rescanning known occurrences |
244 | of the period. */ |
245 | size_t memory = 0; |
246 | j = 0; |
247 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
248 | { |
249 | const unsigned char *pneedle; |
250 | const unsigned char *phaystack; |
251 | |
252 | /* Scan for matches in right half. */ |
253 | i = MAX (suffix, memory); |
254 | pneedle = &needle[i]; |
255 | phaystack = &haystack[i + j]; |
256 | while (i < needle_len && (CANON_ELEMENT (*pneedle++) |
257 | == CANON_ELEMENT (*phaystack++))) |
258 | ++i; |
259 | if (needle_len <= i) |
260 | { |
261 | /* Scan for matches in left half. */ |
262 | i = suffix - 1; |
263 | pneedle = &needle[i]; |
264 | phaystack = &haystack[i + j]; |
265 | while (memory < i + 1 && (CANON_ELEMENT (*pneedle--) |
266 | == CANON_ELEMENT (*phaystack--))) |
267 | --i; |
268 | if (i + 1 < memory + 1) |
269 | return (RETURN_TYPE) (haystack + j); |
270 | /* No match, so remember how many repetitions of period |
271 | on the right half were scanned. */ |
272 | j += period; |
273 | memory = needle_len - period; |
274 | } |
275 | else |
276 | { |
277 | j += i - suffix + 1; |
278 | memory = 0; |
279 | } |
280 | } |
281 | } |
282 | else |
283 | { |
284 | const unsigned char *phaystack = &haystack[suffix]; |
285 | /* The comparison always starts from needle[suffix], so cache it |
286 | and use an optimized first-character loop. */ |
287 | unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); |
288 | |
289 | #if CHECK_EOL |
290 | /* We start matching from the SUFFIX'th element, so make sure we |
291 | don't hit '\0' before that. */ |
292 | if (haystack_len < suffix + 1 |
293 | && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) |
294 | return NULL; |
295 | #endif |
296 | |
297 | /* The two halves of needle are distinct; no extra memory is |
298 | required, and any mismatch results in a maximal shift. */ |
299 | period = MAX (suffix, needle_len - suffix) + 1; |
300 | j = 0; |
301 | while (1 |
302 | #if !CHECK_EOL |
303 | && AVAILABLE (haystack, haystack_len, j, needle_len) |
304 | #endif |
305 | ) |
306 | { |
307 | unsigned char haystack_char; |
308 | const unsigned char *pneedle; |
309 | |
310 | /* TODO: The first-character loop can be sped up by adapting |
311 | longword-at-a-time implementation of memchr/strchr. */ |
312 | if (needle_suffix |
313 | != (haystack_char = CANON_ELEMENT (*phaystack++))) |
314 | { |
315 | RET0_IF_0 (haystack_char); |
316 | #if !CHECK_EOL |
317 | ++j; |
318 | #endif |
319 | continue; |
320 | } |
321 | |
322 | #if CHECK_EOL |
323 | /* Calculate J if it wasn't kept up-to-date in the first-character |
324 | loop. */ |
325 | j = phaystack - &haystack[suffix] - 1; |
326 | #endif |
327 | |
328 | /* Scan for matches in right half. */ |
329 | i = suffix + 1; |
330 | pneedle = &needle[i]; |
331 | while (i < needle_len) |
332 | { |
333 | if (CANON_ELEMENT (*pneedle++) |
334 | != (haystack_char = CANON_ELEMENT (*phaystack++))) |
335 | { |
336 | RET0_IF_0 (haystack_char); |
337 | break; |
338 | } |
339 | ++i; |
340 | } |
341 | if (needle_len <= i) |
342 | { |
343 | /* Scan for matches in left half. */ |
344 | i = suffix - 1; |
345 | pneedle = &needle[i]; |
346 | phaystack = &haystack[i + j]; |
347 | while (i != SIZE_MAX) |
348 | { |
349 | if (CANON_ELEMENT (*pneedle--) |
350 | != (haystack_char = CANON_ELEMENT (*phaystack--))) |
351 | { |
352 | RET0_IF_0 (haystack_char); |
353 | break; |
354 | } |
355 | --i; |
356 | } |
357 | if (i == SIZE_MAX) |
358 | return (RETURN_TYPE) (haystack + j); |
359 | j += period; |
360 | } |
361 | else |
362 | j += i - suffix + 1; |
363 | |
364 | #if CHECK_EOL |
365 | if (!AVAILABLE (haystack, haystack_len, j, needle_len)) |
366 | break; |
367 | #endif |
368 | |
369 | phaystack = &haystack[suffix + j]; |
370 | } |
371 | } |
372 | ret0: __attribute__ ((unused)) |
373 | return NULL; |
374 | } |
375 | |
376 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
377 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
378 | method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. |
379 | Performance is guaranteed to be linear, with an initialization cost |
380 | of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. |
381 | |
382 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
383 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, |
384 | and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. |
385 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
386 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and |
387 | sublinear performance is not possible. */ |
388 | static RETURN_TYPE |
389 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
390 | const unsigned char *needle, size_t needle_len) |
391 | { |
392 | size_t i; /* Index into current byte of NEEDLE. */ |
393 | size_t j; /* Index into current window of HAYSTACK. */ |
394 | size_t period; /* The period of the right half of needle. */ |
395 | size_t suffix; /* The index of the right half of needle. */ |
396 | size_t shift_table[1U << CHAR_BIT]; /* See below. */ |
397 | |
398 | /* Factor the needle into two halves, such that the left half is |
399 | smaller than the global period, and the right half is |
400 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
401 | suffix = critical_factorization (needle, needle_len, &period); |
402 | |
403 | /* Populate shift_table. For each possible byte value c, |
404 | shift_table[c] is the distance from the last occurrence of c to |
405 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. |
406 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
407 | for (i = 0; i < 1U << CHAR_BIT; i++) |
408 | shift_table[i] = needle_len; |
409 | for (i = 0; i < needle_len; i++) |
410 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
411 | |
412 | /* Perform the search. Each iteration compares the right half |
413 | first. */ |
414 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
415 | { |
416 | /* Entire needle is periodic; a mismatch can only advance by the |
417 | period, so use memory to avoid rescanning known occurrences |
418 | of the period. */ |
419 | size_t memory = 0; |
420 | size_t shift; |
421 | j = 0; |
422 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
423 | { |
424 | const unsigned char *pneedle; |
425 | const unsigned char *phaystack; |
426 | |
427 | /* Check the last byte first; if it does not match, then |
428 | shift to the next possible match location. */ |
429 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
430 | if (0 < shift) |
431 | { |
432 | if (memory && shift < period) |
433 | { |
434 | /* Since needle is periodic, but the last period has |
435 | a byte out of place, there can be no match until |
436 | after the mismatch. */ |
437 | shift = needle_len - period; |
438 | } |
439 | memory = 0; |
440 | j += shift; |
441 | continue; |
442 | } |
443 | /* Scan for matches in right half. The last byte has |
444 | already been matched, by virtue of the shift table. */ |
445 | i = MAX (suffix, memory); |
446 | pneedle = &needle[i]; |
447 | phaystack = &haystack[i + j]; |
448 | while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++) |
449 | == CANON_ELEMENT (*phaystack++))) |
450 | ++i; |
451 | if (needle_len - 1 <= i) |
452 | { |
453 | /* Scan for matches in left half. */ |
454 | i = suffix - 1; |
455 | pneedle = &needle[i]; |
456 | phaystack = &haystack[i + j]; |
457 | while (memory < i + 1 && (CANON_ELEMENT (*pneedle--) |
458 | == CANON_ELEMENT (*phaystack--))) |
459 | --i; |
460 | if (i + 1 < memory + 1) |
461 | return (RETURN_TYPE) (haystack + j); |
462 | /* No match, so remember how many repetitions of period |
463 | on the right half were scanned. */ |
464 | j += period; |
465 | memory = needle_len - period; |
466 | } |
467 | else |
468 | { |
469 | j += i - suffix + 1; |
470 | memory = 0; |
471 | } |
472 | } |
473 | } |
474 | else |
475 | { |
476 | /* The two halves of needle are distinct; no extra memory is |
477 | required, and any mismatch results in a maximal shift. */ |
478 | size_t shift; |
479 | period = MAX (suffix, needle_len - suffix) + 1; |
480 | j = 0; |
481 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
482 | { |
483 | const unsigned char *pneedle; |
484 | const unsigned char *phaystack; |
485 | |
486 | /* Check the last byte first; if it does not match, then |
487 | shift to the next possible match location. */ |
488 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
489 | if (0 < shift) |
490 | { |
491 | j += shift; |
492 | continue; |
493 | } |
494 | /* Scan for matches in right half. The last byte has |
495 | already been matched, by virtue of the shift table. */ |
496 | i = suffix; |
497 | pneedle = &needle[i]; |
498 | phaystack = &haystack[i + j]; |
499 | while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++) |
500 | == CANON_ELEMENT (*phaystack++))) |
501 | ++i; |
502 | if (needle_len - 1 <= i) |
503 | { |
504 | /* Scan for matches in left half. */ |
505 | i = suffix - 1; |
506 | pneedle = &needle[i]; |
507 | phaystack = &haystack[i + j]; |
508 | while (i != SIZE_MAX && (CANON_ELEMENT (*pneedle--) |
509 | == CANON_ELEMENT (*phaystack--))) |
510 | --i; |
511 | if (i == SIZE_MAX) |
512 | return (RETURN_TYPE) (haystack + j); |
513 | j += period; |
514 | } |
515 | else |
516 | j += i - suffix + 1; |
517 | } |
518 | } |
519 | return NULL; |
520 | } |
521 | |
522 | #undef AVAILABLE |
523 | #undef CANON_ELEMENT |
524 | #undef CMP_FUNC |
525 | #undef RET0_IF_0 |
526 | #undef RETURN_TYPE |
527 | #undef CHECK_EOL |
528 | |