1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
2 | Copyright (C) 2018-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef STRCMP |
24 | # define STRCMP __strcmp_avx2 |
25 | # endif |
26 | |
27 | # define PAGE_SIZE 4096 |
28 | |
29 | /* VEC_SIZE = Number of bytes in a ymm register */ |
30 | # define VEC_SIZE 32 |
31 | |
32 | /* Shift for dividing by (VEC_SIZE * 4). */ |
33 | # define DIVIDE_BY_VEC_4_SHIFT 7 |
34 | # if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
35 | # error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
36 | # endif |
37 | |
38 | # ifdef USE_AS_WCSCMP |
39 | /* Compare packed dwords. */ |
40 | # define VPCMPEQ vpcmpeqd |
41 | /* Compare packed dwords and store minimum. */ |
42 | # define VPMINU vpminud |
43 | /* 1 dword char == 4 bytes. */ |
44 | # define SIZE_OF_CHAR 4 |
45 | # else |
46 | /* Compare packed bytes. */ |
47 | # define VPCMPEQ vpcmpeqb |
48 | /* Compare packed bytes and store minimum. */ |
49 | # define VPMINU vpminub |
50 | /* 1 byte char == 1 byte. */ |
51 | # define SIZE_OF_CHAR 1 |
52 | # endif |
53 | |
54 | # ifndef VZEROUPPER |
55 | # define VZEROUPPER vzeroupper |
56 | # endif |
57 | |
58 | /* Warning! |
59 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
60 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
61 | */ |
62 | |
/* The main idea of the string comparison (byte or dword) using AVX2
   is to compare (VPCMPEQ) two ymm vectors, holding either packed
   bytes or packed dwords depending on USE_AS_WCSCMP.  To also detect
   the null char, the algorithm takes the minimum (VPMINU) of the
   source data and the comparison result, so a zero element means
   either a mismatch or a null char, which a second VPCMPEQ against
   zero then exposes.  In general, comparing VEC_SIZE bytes (32 bytes)
   costs two VPCMPEQ and one VPMINU instructions, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) per iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero
   is returned.  */
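
/* Illustration only (not part of the original source): the byte-string
   flavor of the per-vector check described above corresponds roughly
   to the following C with AVX2 intrinsics; the helper name is
   hypothetical.

     #include <immintrin.h>
     #include <stdint.h>

     // Bit i of the result is set iff byte i differs between s1 and
     // s2 or s1[i] is the terminating NUL.
     static inline uint32_t
     check_one_vec (const char *s1, const char *s2)
     {
       __m256i a   = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i b   = _mm256_loadu_si256 ((const __m256i *) s2);
       __m256i eq  = _mm256_cmpeq_epi8 (a, b);      // VPCMPEQ
       __m256i min = _mm256_min_epu8 (a, eq);       // VPMINU
       __m256i z   = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
       return (uint32_t) _mm256_movemask_epi8 (z);  // vpmovmskb
     }

   A nonzero mask is resolved with tzcnt to get the index of the first
   mismatch or NUL; the return value is then computed from the two
   bytes (dwords for wcscmp) at that index.  */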
77 | |
	.section .text.avx,"ax",@progbits
79 | ENTRY (STRCMP) |
80 | # ifdef USE_AS_STRNCMP |
81 | /* Check for simple cases (0 or 1) in offset. */ |
82 | cmp $1, %RDX_LP |
83 | je L(char0) |
84 | jb L(zero) |
85 | # ifdef USE_AS_WCSCMP |
/* Convert units: from wide characters to bytes.  */
87 | shl $2, %RDX_LP |
88 | # endif |
89 | /* Register %r11 tracks the maximum offset. */ |
90 | mov %RDX_LP, %R11_LP |
91 | # endif |
92 | movl %edi, %eax |
93 | xorl %edx, %edx |
94 | /* Make %xmm7 (%ymm7) all zeros in this function. */ |
95 | vpxor %xmm7, %xmm7, %xmm7 |
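/* Illustration only: the page check below is roughly equivalent to
   this C sketch (names are not from the original source); it is
   conservative and may take the slow path even when neither pointer
   is actually near a page end.

     if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
         > PAGE_SIZE - 4 * VEC_SIZE)
       goto cross_page;   // a 4-vector read could cross a page
 */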
96 | orl %esi, %eax |
97 | andl $(PAGE_SIZE - 1), %eax |
98 | cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax |
99 | jg L(cross_page) |
100 | /* Start comparing 4 vectors. */ |
101 | vmovdqu (%rdi), %ymm1 |
102 | VPCMPEQ (%rsi), %ymm1, %ymm0 |
103 | VPMINU %ymm1, %ymm0, %ymm0 |
104 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
105 | vpmovmskb %ymm0, %ecx |
106 | testl %ecx, %ecx |
107 | je L(next_3_vectors) |
108 | tzcntl %ecx, %edx |
109 | # ifdef USE_AS_STRNCMP |
/* Return 0 if the mismatched index (%rdx) reaches or exceeds the
   maximum offset (%r11).  */
112 | cmpq %r11, %rdx |
113 | jae L(zero) |
114 | # endif |
115 | # ifdef USE_AS_WCSCMP |
116 | xorl %eax, %eax |
117 | movl (%rdi, %rdx), %ecx |
118 | cmpl (%rsi, %rdx), %ecx |
119 | je L(return) |
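/* The dwords above were compared as SIGNED values (see the warning
   near the top of the file).  The sequence below maps the outcome to
   -1 or +1, i.e. roughly (illustration only): return a < b ? -1 : 1; */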
120 | L(wcscmp_return): |
121 | setl %al |
122 | negl %eax |
123 | orl $1, %eax |
124 | L(return): |
125 | # else |
126 | movzbl (%rdi, %rdx), %eax |
127 | movzbl (%rsi, %rdx), %edx |
128 | subl %edx, %eax |
129 | # endif |
130 | VZEROUPPER |
131 | ret |
132 | |
133 | .p2align 4 |
134 | L(return_vec_size): |
135 | tzcntl %ecx, %edx |
136 | # ifdef USE_AS_STRNCMP |
/* Return 0 if the mismatched index (%rdx + VEC_SIZE) reaches or
   exceeds the maximum offset (%r11).  */
139 | addq $VEC_SIZE, %rdx |
140 | cmpq %r11, %rdx |
141 | jae L(zero) |
142 | # ifdef USE_AS_WCSCMP |
143 | xorl %eax, %eax |
144 | movl (%rdi, %rdx), %ecx |
145 | cmpl (%rsi, %rdx), %ecx |
146 | jne L(wcscmp_return) |
147 | # else |
148 | movzbl (%rdi, %rdx), %eax |
149 | movzbl (%rsi, %rdx), %edx |
150 | subl %edx, %eax |
151 | # endif |
152 | # else |
153 | # ifdef USE_AS_WCSCMP |
154 | xorl %eax, %eax |
155 | movl VEC_SIZE(%rdi, %rdx), %ecx |
156 | cmpl VEC_SIZE(%rsi, %rdx), %ecx |
157 | jne L(wcscmp_return) |
158 | # else |
159 | movzbl VEC_SIZE(%rdi, %rdx), %eax |
160 | movzbl VEC_SIZE(%rsi, %rdx), %edx |
161 | subl %edx, %eax |
162 | # endif |
163 | # endif |
164 | VZEROUPPER |
165 | ret |
166 | |
167 | .p2align 4 |
168 | L(return_2_vec_size): |
169 | tzcntl %ecx, %edx |
170 | # ifdef USE_AS_STRNCMP |
/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) reaches or
   exceeds the maximum offset (%r11).  */
173 | addq $(VEC_SIZE * 2), %rdx |
174 | cmpq %r11, %rdx |
175 | jae L(zero) |
176 | # ifdef USE_AS_WCSCMP |
177 | xorl %eax, %eax |
178 | movl (%rdi, %rdx), %ecx |
179 | cmpl (%rsi, %rdx), %ecx |
180 | jne L(wcscmp_return) |
181 | # else |
182 | movzbl (%rdi, %rdx), %eax |
183 | movzbl (%rsi, %rdx), %edx |
184 | subl %edx, %eax |
185 | # endif |
186 | # else |
187 | # ifdef USE_AS_WCSCMP |
188 | xorl %eax, %eax |
189 | movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx |
190 | cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx |
191 | jne L(wcscmp_return) |
192 | # else |
193 | movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax |
194 | movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx |
195 | subl %edx, %eax |
196 | # endif |
197 | # endif |
198 | VZEROUPPER |
199 | ret |
200 | |
201 | .p2align 4 |
202 | L(return_3_vec_size): |
203 | tzcntl %ecx, %edx |
204 | # ifdef USE_AS_STRNCMP |
/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) reaches or
   exceeds the maximum offset (%r11).  */
207 | addq $(VEC_SIZE * 3), %rdx |
208 | cmpq %r11, %rdx |
209 | jae L(zero) |
210 | # ifdef USE_AS_WCSCMP |
211 | xorl %eax, %eax |
212 | movl (%rdi, %rdx), %ecx |
213 | cmpl (%rsi, %rdx), %ecx |
214 | jne L(wcscmp_return) |
215 | # else |
216 | movzbl (%rdi, %rdx), %eax |
217 | movzbl (%rsi, %rdx), %edx |
218 | subl %edx, %eax |
219 | # endif |
220 | # else |
221 | # ifdef USE_AS_WCSCMP |
222 | xorl %eax, %eax |
223 | movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx |
224 | cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx |
225 | jne L(wcscmp_return) |
226 | # else |
227 | movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax |
228 | movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx |
229 | subl %edx, %eax |
230 | # endif |
231 | # endif |
232 | VZEROUPPER |
233 | ret |
234 | |
235 | .p2align 4 |
236 | L(next_3_vectors): |
237 | vmovdqu VEC_SIZE(%rdi), %ymm6 |
238 | VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 |
239 | VPMINU %ymm6, %ymm3, %ymm3 |
240 | VPCMPEQ %ymm7, %ymm3, %ymm3 |
241 | vpmovmskb %ymm3, %ecx |
242 | testl %ecx, %ecx |
243 | jne L(return_vec_size) |
244 | vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 |
245 | vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 |
246 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 |
247 | VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 |
248 | VPMINU %ymm5, %ymm2, %ymm2 |
249 | VPCMPEQ %ymm4, %ymm0, %ymm0 |
250 | VPCMPEQ %ymm7, %ymm2, %ymm2 |
251 | vpmovmskb %ymm2, %ecx |
252 | testl %ecx, %ecx |
253 | jne L(return_2_vec_size) |
254 | VPMINU %ymm4, %ymm0, %ymm0 |
255 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
256 | vpmovmskb %ymm0, %ecx |
257 | testl %ecx, %ecx |
258 | jne L(return_3_vec_size) |
259 | L(main_loop_header): |
260 | leaq (VEC_SIZE * 4)(%rdi), %rdx |
261 | movl $PAGE_SIZE, %ecx |
262 | /* Align load via RAX. */ |
263 | andq $-(VEC_SIZE * 4), %rdx |
264 | subq %rdi, %rdx |
265 | leaq (%rdi, %rdx), %rax |
266 | # ifdef USE_AS_STRNCMP |
/* Starting from this point, the maximum offset, or simply the
   'offset', DECREASES by the same amount when base pointers are
   moved forward.  Return 0 when:
     1) On match: offset <= the matched vector index.
     2) On mismatch: offset is before the mismatched index.
 */
273 | subq %rdx, %r11 |
274 | jbe L(zero) |
275 | # endif |
276 | addq %rsi, %rdx |
277 | movq %rdx, %rsi |
278 | andl $(PAGE_SIZE - 1), %esi |
279 | /* Number of bytes before page crossing. */ |
280 | subq %rsi, %rcx |
281 | /* Number of VEC_SIZE * 4 blocks before page crossing. */ |
282 | shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx |
283 | /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ |
284 | movl %ecx, %esi |
285 | jmp L(loop_start) |
286 | |
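/* Illustration only (not part of the original source): each iteration
   of the main loop folds four 32-byte checks into a single test.
   Assuming a0..a3 are the four aligned loads from the first string and
   b0..b3 the corresponding loads from the second, the logic is roughly:

     __m256i c0 = _mm256_min_epu8 (a0, _mm256_cmpeq_epi8 (a0, b0));
     __m256i c1 = _mm256_min_epu8 (a1, _mm256_cmpeq_epi8 (a1, b1));
     __m256i c2 = _mm256_min_epu8 (a2, _mm256_cmpeq_epi8 (a2, b2));
     __m256i c3 = _mm256_min_epu8 (a3, _mm256_cmpeq_epi8 (a3, b3));
     __m256i all = _mm256_min_epu8 (_mm256_min_epu8 (c0, c1),
                                    _mm256_min_epu8 (c2, c3));
     if (_mm256_movemask_epi8 (_mm256_cmpeq_epi8
                               (all, _mm256_setzero_si256 ())) == 0)
       continue;    // no mismatch and no NUL in these 128 bytes
     // otherwise c0..c3 are re-tested one at a time to find the first hit

   ESI counts how many such 128-byte iterations remain before %rdx
   crosses a page; when it is exhausted, L(loop_cross_page) below takes
   over.  */
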
287 | .p2align 4 |
288 | L(loop): |
289 | # ifdef USE_AS_STRNCMP |
290 | /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease |
291 | the maximum offset (%r11) by the same amount. */ |
292 | subq $(VEC_SIZE * 4), %r11 |
293 | jbe L(zero) |
294 | # endif |
295 | addq $(VEC_SIZE * 4), %rax |
296 | addq $(VEC_SIZE * 4), %rdx |
297 | L(loop_start): |
298 | testl %esi, %esi |
299 | leal -1(%esi), %esi |
300 | je L(loop_cross_page) |
301 | L(back_to_loop): |
/* Main loop, comparing 4 vectors at a time.  */
303 | vmovdqa (%rax), %ymm0 |
304 | vmovdqa VEC_SIZE(%rax), %ymm3 |
305 | VPCMPEQ (%rdx), %ymm0, %ymm4 |
306 | VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 |
307 | VPMINU %ymm0, %ymm4, %ymm4 |
308 | VPMINU %ymm3, %ymm1, %ymm1 |
309 | vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 |
310 | VPMINU %ymm1, %ymm4, %ymm0 |
311 | vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 |
312 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 |
313 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 |
314 | VPMINU %ymm2, %ymm5, %ymm5 |
315 | VPMINU %ymm3, %ymm6, %ymm6 |
316 | VPMINU %ymm5, %ymm0, %ymm0 |
317 | VPMINU %ymm6, %ymm0, %ymm0 |
318 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
319 | |
/* Test each mask (32 bits) individually because with VEC_SIZE == 32
   it is not possible to OR the four masks and keep all bits in a
   64-bit integer register, unlike SSE2 strcmp where ORing is
   possible.  */
324 | vpmovmskb %ymm0, %ecx |
325 | testl %ecx, %ecx |
326 | je L(loop) |
327 | VPCMPEQ %ymm7, %ymm4, %ymm0 |
328 | vpmovmskb %ymm0, %edi |
329 | testl %edi, %edi |
330 | je L(test_vec) |
331 | tzcntl %edi, %ecx |
332 | # ifdef USE_AS_STRNCMP |
333 | cmpq %rcx, %r11 |
334 | jbe L(zero) |
335 | # ifdef USE_AS_WCSCMP |
336 | movq %rax, %rsi |
337 | xorl %eax, %eax |
338 | movl (%rsi, %rcx), %edi |
339 | cmpl (%rdx, %rcx), %edi |
340 | jne L(wcscmp_return) |
341 | # else |
342 | movzbl (%rax, %rcx), %eax |
343 | movzbl (%rdx, %rcx), %edx |
344 | subl %edx, %eax |
345 | # endif |
346 | # else |
347 | # ifdef USE_AS_WCSCMP |
348 | movq %rax, %rsi |
349 | xorl %eax, %eax |
350 | movl (%rsi, %rcx), %edi |
351 | cmpl (%rdx, %rcx), %edi |
352 | jne L(wcscmp_return) |
353 | # else |
354 | movzbl (%rax, %rcx), %eax |
355 | movzbl (%rdx, %rcx), %edx |
356 | subl %edx, %eax |
357 | # endif |
358 | # endif |
359 | VZEROUPPER |
360 | ret |
361 | |
362 | .p2align 4 |
363 | L(test_vec): |
364 | # ifdef USE_AS_STRNCMP |
365 | /* The first vector matched. Return 0 if the maximum offset |
366 | (%r11) <= VEC_SIZE. */ |
367 | cmpq $VEC_SIZE, %r11 |
368 | jbe L(zero) |
369 | # endif |
370 | VPCMPEQ %ymm7, %ymm1, %ymm1 |
371 | vpmovmskb %ymm1, %ecx |
372 | testl %ecx, %ecx |
373 | je L(test_2_vec) |
374 | tzcntl %ecx, %edi |
375 | # ifdef USE_AS_STRNCMP |
376 | addq $VEC_SIZE, %rdi |
377 | cmpq %rdi, %r11 |
378 | jbe L(zero) |
379 | # ifdef USE_AS_WCSCMP |
380 | movq %rax, %rsi |
381 | xorl %eax, %eax |
382 | movl (%rsi, %rdi), %ecx |
383 | cmpl (%rdx, %rdi), %ecx |
384 | jne L(wcscmp_return) |
385 | # else |
386 | movzbl (%rax, %rdi), %eax |
387 | movzbl (%rdx, %rdi), %edx |
388 | subl %edx, %eax |
389 | # endif |
390 | # else |
391 | # ifdef USE_AS_WCSCMP |
392 | movq %rax, %rsi |
393 | xorl %eax, %eax |
394 | movl VEC_SIZE(%rsi, %rdi), %ecx |
395 | cmpl VEC_SIZE(%rdx, %rdi), %ecx |
396 | jne L(wcscmp_return) |
397 | # else |
398 | movzbl VEC_SIZE(%rax, %rdi), %eax |
399 | movzbl VEC_SIZE(%rdx, %rdi), %edx |
400 | subl %edx, %eax |
401 | # endif |
402 | # endif |
403 | VZEROUPPER |
404 | ret |
405 | |
406 | .p2align 4 |
407 | L(test_2_vec): |
408 | # ifdef USE_AS_STRNCMP |
409 | /* The first 2 vectors matched. Return 0 if the maximum offset |
410 | (%r11) <= 2 * VEC_SIZE. */ |
411 | cmpq $(VEC_SIZE * 2), %r11 |
412 | jbe L(zero) |
413 | # endif |
414 | VPCMPEQ %ymm7, %ymm5, %ymm5 |
415 | vpmovmskb %ymm5, %ecx |
416 | testl %ecx, %ecx |
417 | je L(test_3_vec) |
418 | tzcntl %ecx, %edi |
419 | # ifdef USE_AS_STRNCMP |
420 | addq $(VEC_SIZE * 2), %rdi |
421 | cmpq %rdi, %r11 |
422 | jbe L(zero) |
423 | # ifdef USE_AS_WCSCMP |
424 | movq %rax, %rsi |
425 | xorl %eax, %eax |
426 | movl (%rsi, %rdi), %ecx |
427 | cmpl (%rdx, %rdi), %ecx |
428 | jne L(wcscmp_return) |
429 | # else |
430 | movzbl (%rax, %rdi), %eax |
431 | movzbl (%rdx, %rdi), %edx |
432 | subl %edx, %eax |
433 | # endif |
434 | # else |
435 | # ifdef USE_AS_WCSCMP |
436 | movq %rax, %rsi |
437 | xorl %eax, %eax |
438 | movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx |
439 | cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx |
440 | jne L(wcscmp_return) |
441 | # else |
442 | movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax |
443 | movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx |
444 | subl %edx, %eax |
445 | # endif |
446 | # endif |
447 | VZEROUPPER |
448 | ret |
449 | |
450 | .p2align 4 |
451 | L(test_3_vec): |
452 | # ifdef USE_AS_STRNCMP |
453 | /* The first 3 vectors matched. Return 0 if the maximum offset |
454 | (%r11) <= 3 * VEC_SIZE. */ |
455 | cmpq $(VEC_SIZE * 3), %r11 |
456 | jbe L(zero) |
457 | # endif |
458 | VPCMPEQ %ymm7, %ymm6, %ymm6 |
459 | vpmovmskb %ymm6, %esi |
460 | tzcntl %esi, %ecx |
461 | # ifdef USE_AS_STRNCMP |
462 | addq $(VEC_SIZE * 3), %rcx |
463 | cmpq %rcx, %r11 |
464 | jbe L(zero) |
465 | # ifdef USE_AS_WCSCMP |
466 | movq %rax, %rsi |
467 | xorl %eax, %eax |
468 | movl (%rsi, %rcx), %esi |
469 | cmpl (%rdx, %rcx), %esi |
470 | jne L(wcscmp_return) |
471 | # else |
472 | movzbl (%rax, %rcx), %eax |
473 | movzbl (%rdx, %rcx), %edx |
474 | subl %edx, %eax |
475 | # endif |
476 | # else |
477 | # ifdef USE_AS_WCSCMP |
478 | movq %rax, %rsi |
479 | xorl %eax, %eax |
480 | movl (VEC_SIZE * 3)(%rsi, %rcx), %esi |
481 | cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi |
482 | jne L(wcscmp_return) |
483 | # else |
484 | movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax |
485 | movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx |
486 | subl %edx, %eax |
487 | # endif |
488 | # endif |
489 | VZEROUPPER |
490 | ret |
491 | |
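/* Illustration only: when the next 4 * VEC_SIZE bytes of %rdx would
   cross a page, the block is instead read from its 4 * VEC_SIZE
   aligned start (%r10 = -(%rdx % (VEC_SIZE * 4))) and the mask bits
   belonging to bytes before the real data are shifted out.  Assuming
   mask_lo/mask_hi are the vpmovmskb results of two consecutive
   vectors and skip is that misalignment, the idea is roughly:

     uint64_t mask = (uint64_t) mask_lo | ((uint64_t) mask_hi << 32);
     mask >>= skip;               // drop bytes that precede the data
     if (mask != 0)
       {
         size_t idx = __builtin_ctzll (mask);  // offset from %rax/%rdx
         // ... compare the chars at idx as in the other exit paths
       }
 */
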
492 | .p2align 4 |
493 | L(loop_cross_page): |
494 | xorl %r10d, %r10d |
495 | movq %rdx, %rcx |
496 | /* Align load via RDX. We load the extra ECX bytes which should |
497 | be ignored. */ |
498 | andl $((VEC_SIZE * 4) - 1), %ecx |
499 | /* R10 is -RCX. */ |
500 | subq %rcx, %r10 |
501 | |
502 | /* This works only if VEC_SIZE * 2 == 64. */ |
503 | # if (VEC_SIZE * 2) != 64 |
504 | # error (VEC_SIZE * 2) != 64 |
505 | # endif |
506 | |
507 | /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ |
508 | cmpl $(VEC_SIZE * 2), %ecx |
509 | jge L(loop_cross_page_2_vec) |
510 | |
511 | vmovdqu (%rax, %r10), %ymm2 |
512 | vmovdqu VEC_SIZE(%rax, %r10), %ymm3 |
513 | VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 |
514 | VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 |
515 | VPMINU %ymm2, %ymm0, %ymm0 |
516 | VPMINU %ymm3, %ymm1, %ymm1 |
517 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
518 | VPCMPEQ %ymm7, %ymm1, %ymm1 |
519 | |
520 | vpmovmskb %ymm0, %edi |
521 | vpmovmskb %ymm1, %esi |
522 | |
523 | salq $32, %rsi |
524 | xorq %rsi, %rdi |
525 | |
526 | /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ |
527 | shrq %cl, %rdi |
528 | |
529 | testq %rdi, %rdi |
530 | je L(loop_cross_page_2_vec) |
531 | tzcntq %rdi, %rcx |
532 | # ifdef USE_AS_STRNCMP |
533 | cmpq %rcx, %r11 |
534 | jbe L(zero) |
535 | # ifdef USE_AS_WCSCMP |
536 | movq %rax, %rsi |
537 | xorl %eax, %eax |
538 | movl (%rsi, %rcx), %edi |
539 | cmpl (%rdx, %rcx), %edi |
540 | jne L(wcscmp_return) |
541 | # else |
542 | movzbl (%rax, %rcx), %eax |
543 | movzbl (%rdx, %rcx), %edx |
544 | subl %edx, %eax |
545 | # endif |
546 | # else |
547 | # ifdef USE_AS_WCSCMP |
548 | movq %rax, %rsi |
549 | xorl %eax, %eax |
550 | movl (%rsi, %rcx), %edi |
551 | cmpl (%rdx, %rcx), %edi |
552 | jne L(wcscmp_return) |
553 | # else |
554 | movzbl (%rax, %rcx), %eax |
555 | movzbl (%rdx, %rcx), %edx |
556 | subl %edx, %eax |
557 | # endif |
558 | # endif |
559 | VZEROUPPER |
560 | ret |
561 | |
562 | .p2align 4 |
563 | L(loop_cross_page_2_vec): |
564 | /* The first VEC_SIZE * 2 bytes match or are ignored. */ |
565 | vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 |
566 | vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 |
567 | VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 |
568 | VPMINU %ymm2, %ymm5, %ymm5 |
569 | VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 |
570 | VPCMPEQ %ymm7, %ymm5, %ymm5 |
571 | VPMINU %ymm3, %ymm6, %ymm6 |
572 | VPCMPEQ %ymm7, %ymm6, %ymm6 |
573 | |
574 | vpmovmskb %ymm5, %edi |
575 | vpmovmskb %ymm6, %esi |
576 | |
577 | salq $32, %rsi |
578 | xorq %rsi, %rdi |
579 | |
580 | xorl %r8d, %r8d |
581 | /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ |
582 | subl $(VEC_SIZE * 2), %ecx |
583 | jle 1f |
584 | /* Skip ECX bytes. */ |
585 | shrq %cl, %rdi |
586 | /* R8 has number of bytes skipped. */ |
587 | movl %ecx, %r8d |
588 | 1: |
589 | /* Before jumping back to the loop, set ESI to the number of |
590 | VEC_SIZE * 4 blocks before page crossing. */ |
591 | movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi |
592 | |
593 | testq %rdi, %rdi |
594 | # ifdef USE_AS_STRNCMP |
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  The label below
	   checks whether the strncmp maximum offset has been reached.  */
598 | je L(string_nbyte_offset_check) |
599 | # else |
600 | je L(back_to_loop) |
601 | # endif |
602 | tzcntq %rdi, %rcx |
603 | addq %r10, %rcx |
604 | /* Adjust for number of bytes skipped. */ |
605 | addq %r8, %rcx |
606 | # ifdef USE_AS_STRNCMP |
607 | addq $(VEC_SIZE * 2), %rcx |
608 | subq %rcx, %r11 |
609 | jbe L(zero) |
610 | # ifdef USE_AS_WCSCMP |
611 | movq %rax, %rsi |
612 | xorl %eax, %eax |
613 | movl (%rsi, %rcx), %edi |
614 | cmpl (%rdx, %rcx), %edi |
615 | jne L(wcscmp_return) |
616 | # else |
617 | movzbl (%rax, %rcx), %eax |
618 | movzbl (%rdx, %rcx), %edx |
619 | subl %edx, %eax |
620 | # endif |
621 | # else |
622 | # ifdef USE_AS_WCSCMP |
623 | movq %rax, %rsi |
624 | xorl %eax, %eax |
625 | movl (VEC_SIZE * 2)(%rsi, %rcx), %edi |
626 | cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi |
627 | jne L(wcscmp_return) |
628 | # else |
629 | movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax |
630 | movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx |
631 | subl %edx, %eax |
632 | # endif |
633 | # endif |
634 | VZEROUPPER |
635 | ret |
636 | |
637 | # ifdef USE_AS_STRNCMP |
638 | L(string_nbyte_offset_check): |
639 | leaq (VEC_SIZE * 4)(%r10), %r10 |
640 | cmpq %r10, %r11 |
641 | jbe L(zero) |
642 | jmp L(back_to_loop) |
643 | # endif |
644 | |
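/* Illustration only: once the remaining distance to the page boundary
   is too small for the vector fetches below, the comparison proceeds
   one char per iteration.  Assuming ofs is the current offset and
   s1/s2 the two strings, the strcmp flavor is roughly:

     while (ofs < VEC_SIZE * 4)          // plus ofs < n for strncmp
       {
         unsigned char c1 = s1[ofs], c2 = s2[ofs];
         if (c1 != c2 || c1 == 0)
           return c1 - c2;
         ofs++;
       }
     // 4 * VEC_SIZE chars matched: re-enter the vectorized main loop

   wcscmp/wcsncmp load dwords instead and return -1/+1 from a signed
   comparison.  */
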
645 | .p2align 4 |
646 | L(cross_page_loop): |
647 | /* Check one byte/dword at a time. */ |
648 | # ifdef USE_AS_WCSCMP |
649 | cmpl %ecx, %eax |
650 | # else |
651 | subl %ecx, %eax |
652 | # endif |
653 | jne L(different) |
654 | addl $SIZE_OF_CHAR, %edx |
655 | cmpl $(VEC_SIZE * 4), %edx |
656 | je L(main_loop_header) |
657 | # ifdef USE_AS_STRNCMP |
658 | cmpq %r11, %rdx |
659 | jae L(zero) |
660 | # endif |
661 | # ifdef USE_AS_WCSCMP |
662 | movl (%rdi, %rdx), %eax |
663 | movl (%rsi, %rdx), %ecx |
664 | # else |
665 | movzbl (%rdi, %rdx), %eax |
666 | movzbl (%rsi, %rdx), %ecx |
667 | # endif |
668 | /* Check null char. */ |
669 | testl %eax, %eax |
670 | jne L(cross_page_loop) |
671 | /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED |
672 | comparisons. */ |
673 | subl %ecx, %eax |
674 | # ifndef USE_AS_WCSCMP |
675 | L(different): |
676 | # endif |
677 | VZEROUPPER |
678 | ret |
679 | |
680 | # ifdef USE_AS_WCSCMP |
681 | .p2align 4 |
682 | L(different): |
683 | /* Use movl to avoid modifying EFLAGS. */ |
684 | movl $0, %eax |
685 | setl %al |
686 | negl %eax |
687 | orl $1, %eax |
688 | VZEROUPPER |
689 | ret |
690 | # endif |
691 | |
692 | # ifdef USE_AS_STRNCMP |
693 | .p2align 4 |
694 | L(zero): |
695 | xorl %eax, %eax |
696 | VZEROUPPER |
697 | ret |
698 | |
699 | .p2align 4 |
700 | L(char0): |
701 | # ifdef USE_AS_WCSCMP |
702 | xorl %eax, %eax |
703 | movl (%rdi), %ecx |
704 | cmpl (%rsi), %ecx |
705 | jne L(wcscmp_return) |
706 | # else |
707 | movzbl (%rsi), %ecx |
708 | movzbl (%rdi), %eax |
709 | subl %ecx, %eax |
710 | # endif |
711 | VZEROUPPER |
712 | ret |
713 | # endif |
714 | |
715 | .p2align 4 |
716 | L(last_vector): |
717 | addq %rdx, %rdi |
718 | addq %rdx, %rsi |
719 | # ifdef USE_AS_STRNCMP |
720 | subq %rdx, %r11 |
721 | # endif |
722 | tzcntl %ecx, %edx |
723 | # ifdef USE_AS_STRNCMP |
724 | cmpq %r11, %rdx |
725 | jae L(zero) |
726 | # endif |
727 | # ifdef USE_AS_WCSCMP |
728 | xorl %eax, %eax |
729 | movl (%rdi, %rdx), %ecx |
730 | cmpl (%rsi, %rdx), %ecx |
731 | jne L(wcscmp_return) |
732 | # else |
733 | movzbl (%rdi, %rdx), %eax |
734 | movzbl (%rsi, %rdx), %edx |
735 | subl %edx, %eax |
736 | # endif |
737 | VZEROUPPER |
738 | ret |
739 | |
/* Comparing on a page boundary region requires special treatment:
   it must be done one vector at a time, starting with the wider ymm
   vector if possible, and with xmm otherwise.  If fetching 16 bytes
   (xmm) would still cross the boundary, smaller fetches and finally
   a byte/dword comparison must be done.  */
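
/* Illustration only: the narrower steps below follow the same check
   pattern with smaller loads.  E.g. the 8-byte step (vmovq)
   corresponds roughly to the following, where ofs is the current
   offset:

     __m128i a  = _mm_loadl_epi64 ((const __m128i *) (s1 + ofs));
     __m128i b  = _mm_loadl_epi64 ((const __m128i *) (s2 + ofs));
     __m128i m  = _mm_min_epu8 (a, _mm_cmpeq_epi8 (a, b));
     unsigned int mask
       = _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()))
         & 0xff;    // only the low 8 bits correspond to loaded bytes

   The "andl $0xff, %ecx" after the vmovq loads plays the same role:
   the upper halves of both xmm registers are zero and would falsely
   signal a NUL, so their mask bits must be discarded.  */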
745 | .p2align 4 |
746 | L(cross_page): |
747 | /* Try one ymm vector at a time. */ |
748 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
749 | jg L(cross_page_1_vector) |
750 | L(loop_1_vector): |
751 | vmovdqu (%rdi, %rdx), %ymm1 |
752 | VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 |
753 | VPMINU %ymm1, %ymm0, %ymm0 |
754 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
755 | vpmovmskb %ymm0, %ecx |
756 | testl %ecx, %ecx |
757 | jne L(last_vector) |
758 | |
759 | addl $VEC_SIZE, %edx |
760 | |
761 | addl $VEC_SIZE, %eax |
762 | # ifdef USE_AS_STRNCMP |
763 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
764 | (%r11). */ |
765 | cmpq %r11, %rdx |
766 | jae L(zero) |
767 | # endif |
768 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
769 | jle L(loop_1_vector) |
770 | L(cross_page_1_vector): |
771 | /* Less than 32 bytes to check, try one xmm vector. */ |
772 | cmpl $(PAGE_SIZE - 16), %eax |
773 | jg L(cross_page_1_xmm) |
774 | vmovdqu (%rdi, %rdx), %xmm1 |
775 | VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 |
776 | VPMINU %xmm1, %xmm0, %xmm0 |
777 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
778 | vpmovmskb %xmm0, %ecx |
779 | testl %ecx, %ecx |
780 | jne L(last_vector) |
781 | |
782 | addl $16, %edx |
783 | # ifndef USE_AS_WCSCMP |
784 | addl $16, %eax |
785 | # endif |
786 | # ifdef USE_AS_STRNCMP |
787 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
788 | (%r11). */ |
789 | cmpq %r11, %rdx |
790 | jae L(zero) |
791 | # endif |
792 | |
793 | L(cross_page_1_xmm): |
794 | # ifndef USE_AS_WCSCMP |
/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not needed
   for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
797 | cmpl $(PAGE_SIZE - 8), %eax |
798 | jg L(cross_page_8bytes) |
799 | vmovq (%rdi, %rdx), %xmm1 |
800 | vmovq (%rsi, %rdx), %xmm0 |
801 | VPCMPEQ %xmm0, %xmm1, %xmm0 |
802 | VPMINU %xmm1, %xmm0, %xmm0 |
803 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
804 | vpmovmskb %xmm0, %ecx |
805 | /* Only last 8 bits are valid. */ |
806 | andl $0xff, %ecx |
807 | testl %ecx, %ecx |
808 | jne L(last_vector) |
809 | |
810 | addl $8, %edx |
811 | addl $8, %eax |
812 | # ifdef USE_AS_STRNCMP |
813 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
814 | (%r11). */ |
815 | cmpq %r11, %rdx |
816 | jae L(zero) |
817 | # endif |
818 | |
819 | L(cross_page_8bytes): |
820 | /* Less than 8 bytes to check, try 4 byte vector. */ |
821 | cmpl $(PAGE_SIZE - 4), %eax |
822 | jg L(cross_page_4bytes) |
823 | vmovd (%rdi, %rdx), %xmm1 |
824 | vmovd (%rsi, %rdx), %xmm0 |
825 | VPCMPEQ %xmm0, %xmm1, %xmm0 |
826 | VPMINU %xmm1, %xmm0, %xmm0 |
827 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
828 | vpmovmskb %xmm0, %ecx |
829 | /* Only last 4 bits are valid. */ |
830 | andl $0xf, %ecx |
831 | testl %ecx, %ecx |
832 | jne L(last_vector) |
833 | |
834 | addl $4, %edx |
835 | # ifdef USE_AS_STRNCMP |
836 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
837 | (%r11). */ |
838 | cmpq %r11, %rdx |
839 | jae L(zero) |
840 | # endif |
841 | |
842 | L(cross_page_4bytes): |
843 | # endif |
844 | /* Less than 4 bytes to check, try one byte/dword at a time. */ |
845 | # ifdef USE_AS_STRNCMP |
846 | cmpq %r11, %rdx |
847 | jae L(zero) |
848 | # endif |
849 | # ifdef USE_AS_WCSCMP |
850 | movl (%rdi, %rdx), %eax |
851 | movl (%rsi, %rdx), %ecx |
852 | # else |
853 | movzbl (%rdi, %rdx), %eax |
854 | movzbl (%rsi, %rdx), %ecx |
855 | # endif |
856 | testl %eax, %eax |
857 | jne L(cross_page_loop) |
858 | subl %ecx, %eax |
859 | VZEROUPPER |
860 | ret |
861 | END (STRCMP) |
862 | #endif |
863 | |