/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4). */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords. */
# define VPCMPEQ vpcmpeqd
/* Compare packed dwords and store minimum. */
# define VPMINU vpminud
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
/* Compare packed bytes. */
# define VPCMPEQ vpcmpeqb
/* Compare packed bytes and store minimum. */
# define VPMINU vpminub
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/
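
/* Illustrative C sketch (not part of the build; the helper names are
   made up) of the element-level result computation performed below once
   a difference has been found:

     // wcscmp/wcsncmp: elements are compared as SIGNED 32-bit values.
     int wide_result (int a, int b) { return a < b ? -1 : 1; }

     // strcmp/strncmp: elements are compared as UNSIGNED bytes.
     int narrow_result (unsigned char a, unsigned char b)
     { return (int) a - (int) b; }
 */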

/* The main idea of the string comparison (byte or dword) using AVX2
   is to compare (VPCMPEQ) two ymm vectors, either as packed bytes or
   packed dwords depending on USE_AS_WCSCMP.  To also detect the null
   char, the algorithm keeps track of the matched bytes/dwords, which
   requires two more AVX2 instructions (VPMINU and VPCMPEQ).  In
   general, the cost of comparing VEC_SIZE bytes (32 bytes) is two
   VPCMPEQ and one VPMINU instructions, together with vmovdqu and
   testl instructions.  The main loop (away from the page boundary)
   compares 4 vectors at a time, effectively comparing 4 x VEC_SIZE
   bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero
   is returned.  */
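
/* Rough C model of one VEC_SIZE check used throughout this file
   (illustrative pseudocode only, not part of the build).  A zero lane
   in 'merged' marks the first mismatch or null terminator:

     for (i = 0; i < VEC_SIZE; i++)
       {
         eq[i]     = (s1[i] == s2[i]) ? 0xff : 0;  // VPCMPEQ
         merged[i] = min (s1[i], eq[i]);           // VPMINU: zero iff
                                                   // mismatch or s1[i] == 0
         hit[i]    = (merged[i] == 0) ? 0xff : 0;  // VPCMPEQ against zero
       }
     mask = movemask (hit);                        // vpmovmskb
     if (mask != 0)
       index = tzcnt (mask);                       // first offending byte
 */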

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset. */
	cmp $1, %RDX_LP
	je L(char0)
	jb L(zero)
# ifdef USE_AS_WCSCMP
# ifndef __ILP32__
	movq %rdx, %rcx
	/* Check if the length could overflow when multiplied by
	   sizeof(wchar_t).  Checking the top 8 bits covers all potential
	   overflow cases as well as cases where it is impossible for the
	   length to bound a valid memory region.  In those cases just use
	   'wcscmp'. */
	shrq $56, %rcx
	jnz __wcscmp_avx2
# endif
	/* Convert units: from wide to byte char. */
	shl $2, %RDX_LP
# endif
	/* Register %r11 tracks the maximum offset. */
	mov %RDX_LP, %R11_LP
# endif
	movl %edi, %eax
	xorl %edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function. */
	vpxor %xmm7, %xmm7, %xmm7
	orl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg L(cross_page)
	/* Start comparing 4 vectors. */
	vmovdqu (%rdi), %ymm1
	VPCMPEQ (%rsi), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(next_3_vectors)
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	je L(return)
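	/* EFLAGS still hold the result of the cmpl above: SETL yields 1
	   when the dword from the first string is (signed) less than the
	   one from the second string and 0 otherwise; negl/orl then turn
	   that into -1 or 1 respectively. */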
L(wcscmp_return):
	setl %al
	negl %eax
	orl $1, %eax
L(return):
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11). */
	addq $VEC_SIZE, %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl VEC_SIZE(%rdi, %rdx), %ecx
	cmpl VEC_SIZE(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rdi, %rdx), %eax
	movzbl VEC_SIZE(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11). */
	addq $(VEC_SIZE * 2), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11). */
	addq $(VEC_SIZE * 3), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu VEC_SIZE(%rdi), %ymm6
	VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU %ymm6, %ymm3, %ymm3
	VPCMPEQ %ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl %ecx, %ecx
	jne L(return_vec_size)
	vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU %ymm5, %ymm2, %ymm2
	VPCMPEQ %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl %ecx, %ecx
	jne L(return_2_vec_size)
	VPMINU %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(return_3_vec_size)
L(main_loop_header):
	leaq (VEC_SIZE * 4)(%rdi), %rdx
	movl $PAGE_SIZE, %ecx
	/* Align load via RAX. */
	andq $-(VEC_SIZE * 4), %rdx
	subq %rdi, %rdx
	leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount by which the base
	   pointers are moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
	subq %rdx, %r11
	jbe L(zero)
# endif
	addq %rsi, %rdx
	movq %rdx, %rsi
	andl $(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing. */
	subq %rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing. */
	shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
	movl %ecx, %esi
	jmp L(loop_start)
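	/* Worked example (illustrative): with PAGE_SIZE == 4096 and
	   VEC_SIZE == 32, if the adjusted second-string pointer (%rdx)
	   is 3200 bytes into its page, then RCX = 4096 - 3200 = 896 and
	   896 >> DIVIDE_BY_VEC_4_SHIFT == 7, so seven full 128-byte
	   iterations run before L(loop_cross_page) is taken. */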

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount. */
	subq $(VEC_SIZE * 4), %r11
	jbe L(zero)
# endif
	addq $(VEC_SIZE * 4), %rax
	addq $(VEC_SIZE * 4), %rdx
L(loop_start):
	testl %esi, %esi
	leal -1(%esi), %esi
	je L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time. */
	vmovdqa (%rax), %ymm0
	vmovdqa VEC_SIZE(%rax), %ymm3
	VPCMPEQ (%rdx), %ymm0, %ymm4
	VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU %ymm0, %ymm4, %ymm4
	VPMINU %ymm3, %ymm1, %ymm1
	vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
	VPMINU %ymm1, %ymm4, %ymm0
	vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU %ymm2, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPMINU %ymm5, %ymm0, %ymm0
	VPMINU %ymm6, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and still keep
	   all bits in a 64-bit integer register, unlike the SSE2 strcmp
	   where ORing is possible. */
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(loop)
	VPCMPEQ %ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl %edi, %edi
	je L(test_vec)
	tzcntl %edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE. */
	cmpq $VEC_SIZE, %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	je L(test_2_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $VEC_SIZE, %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl VEC_SIZE(%rsi, %rdi), %ecx
	cmpl VEC_SIZE(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rax, %rdi), %eax
	movzbl VEC_SIZE(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE. */
	cmpq $(VEC_SIZE * 2), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl %ecx, %ecx
	je L(test_3_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE. */
	cmpq $(VEC_SIZE * 3), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl %esi, %ecx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 3), %rcx
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %esi
	cmpl (%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl %r10d, %r10d
	movq %rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored. */
	andl $((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX. */
	subq %rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64. */
# if (VEC_SIZE * 2) != 64
# error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored. */
	cmpl $(VEC_SIZE * 2), %ecx
	jge L(loop_cross_page_2_vec)

	vmovdqu (%rax, %r10), %ymm2
	vmovdqu VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU %ymm2, %ymm0, %ymm0
	VPMINU %ymm3, %ymm1, %ymm1
	VPCMPEQ %ymm7, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq $32, %rsi
	xorq %rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
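	/* Illustrative example: if ECX == 10, the low 10 bits of the
	   combined 64-bit mask correspond to the 10 bytes just below
	   %rax/%rdx that the backward-adjusted loads re-read; shifting
	   them out lets a later tzcnt index directly from %rax/%rdx. */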
	shrq %cl, %rdi

	testq %rdi, %rdi
	je L(loop_cross_page_2_vec)
	tzcntq %rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored. */
	vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU %ymm2, %ymm5, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ %ymm7, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPCMPEQ %ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq $32, %rsi
	xorq %rsi, %rdi

	xorl %r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
	subl $(VEC_SIZE * 2), %ecx
	jle 1f
	/* Skip ECX bytes. */
	shrq %cl, %rdi
	/* R8 has number of bytes skipped. */
	movl %ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing. */
	movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq %rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested (%r10 is
	   negative).  The target label checks whether the strncmp
	   maximum offset has been reached. */
	je L(string_nbyte_offset_check)
# else
	je L(back_to_loop)
# endif
	tzcntq %rdi, %rcx
	addq %r10, %rcx
	/* Adjust for number of bytes skipped. */
	addq %r8, %rcx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rcx
	subq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq (VEC_SIZE * 4)(%r10), %r10
	cmpq %r10, %r11
	jbe L(zero)
	jmp L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time. */
# ifdef USE_AS_WCSCMP
	cmpl %ecx, %eax
# else
	subl %ecx, %eax
# endif
	jne L(different)
	addl $SIZE_OF_CHAR, %edx
	cmpl $(VEC_SIZE * 4), %edx
	je L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	/* Check null char. */
	testl %eax, %eax
	jne L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons. */
	subl %ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS. */
	movl $0, %eax
	setl %al
	negl %eax
	orl $1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi), %ecx
	cmpl (%rsi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rsi), %ecx
	movzbl (%rdi), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq %rdx, %rdi
	addq %rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq %rdx, %r11
# endif
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting with
	   the wider ymm vector if possible, and with xmm otherwise.  If
	   fetching 16 bytes (xmm) would still cross the boundary, smaller
	   loads and finally a byte/dword-at-a-time comparison are used.
	 */
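	/* Rough shape of the tail handling below (illustrative C-like
	   pseudocode, not part of the build), where 'off' is the larger
	   page offset of the two strings (%eax):

	     while (off <= PAGE_SIZE - 32) compare one ymm vector;
	     if (off <= PAGE_SIZE - 16) compare one xmm vector;
	     if (!wide && off <= PAGE_SIZE - 8) compare 8 bytes;
	     if (!wide && off <= PAGE_SIZE - 4) compare 4 bytes;
	     then compare one byte/dword at a time in L(cross_page_loop).
	 */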
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time. */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu (%rdi, %rdx), %ymm1
	VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $VEC_SIZE, %edx

	addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jle L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector. */
	cmpl $(PAGE_SIZE - 16), %eax
	jg L(cross_page_1_xmm)
	vmovdqu (%rdi, %rdx), %xmm1
	VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $16, %edx
# ifndef USE_AS_WCSCMP
	addl $16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not
	   needed for wcscmp nor wcsncmp since a wide char is 4 bytes. */
	cmpl $(PAGE_SIZE - 8), %eax
	jg L(cross_page_8bytes)
	vmovq (%rdi, %rdx), %xmm1
	vmovq (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 8 bits are valid. */
	andl $0xff, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $8, %edx
	addl $8, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector. */
	cmpl $(PAGE_SIZE - 4), %eax
	jg L(cross_page_4bytes)
	vmovd (%rdi, %rdx), %xmm1
	vmovd (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 4 bits are valid. */
	andl $0xf, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $4, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time. */
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	testl %eax, %eax
	jne L(cross_page_loop)
	subl %ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif