1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
2 | Copyright (C) 2018-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef STRCMP |
24 | # define STRCMP __strcmp_avx2 |
25 | # endif |
26 | |
27 | # define PAGE_SIZE 4096 |
28 | |
29 | /* VEC_SIZE = Number of bytes in a ymm register */ |
30 | # define VEC_SIZE 32 |
31 | |
32 | /* Shift for dividing by (VEC_SIZE * 4). */ |
33 | # define DIVIDE_BY_VEC_4_SHIFT 7 |
34 | # if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
35 | # error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
36 | # endif |
37 | |
38 | # ifdef USE_AS_WCSCMP |
39 | /* Compare packed dwords. */ |
40 | # define VPCMPEQ vpcmpeqd |
41 | /* Compare packed dwords and store minimum. */ |
42 | # define VPMINU vpminud |
43 | /* 1 dword char == 4 bytes. */ |
44 | # define SIZE_OF_CHAR 4 |
45 | # else |
46 | /* Compare packed bytes. */ |
47 | # define VPCMPEQ vpcmpeqb |
48 | /* Compare packed bytes and store minimum. */ |
49 | # define VPMINU vpminub |
50 | /* 1 byte char == 1 byte. */ |
51 | # define SIZE_OF_CHAR 1 |
52 | # endif |
53 | |
54 | # ifndef VZEROUPPER |
55 | # define VZEROUPPER vzeroupper |
56 | # endif |
57 | |
58 | # ifndef SECTION |
59 | # define SECTION(p) p##.avx |
60 | # endif |
61 | |
62 | /* Warning! |
63 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
64 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
65 | */ |
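
/* For example (illustrative): comparing the byte 0xff against 0x01,
   strcmp must treat 0xff as 255 and return a positive value, whereas
   wcscmp comparing the wchar_t value 0xffffffff against 0x00000001
   must treat it as -1 and return a negative value.  */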
66 | |
/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The latter can be
   either packed bytes or dwords depending on USE_AS_WCSCMP.  In order
   to check the null char, the algorithm keeps the unsigned minimum of
   the source data and the comparison result (VPMINU), so a zero lane
   marks either a mismatch or a null char; this requires two more AVX2
   instructions (VPMINU and VPCMPEQ against zero).  In general, the
   cost of comparing VEC_SIZE bytes (32 bytes) is two VPCMPEQ and one
   VPMINU instructions, together with vmovdqu and testl instructions.
   The main loop (away from the page boundary) compares 4 vectors at a
   time, effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each
   iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */
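
/* An illustrative (not literal) per-lane C sketch of that test,
   assuming the byte (strcmp) case with one lane 'a' from the first
   string and 'b' from the second:

     unsigned char eq  = (a == b) ? 0xff : 0x00;        VPCMPEQ
     unsigned char min = (a < eq) ? a : eq;             VPMINU
     int           hit = (min == 0);                    VPCMPEQ vs zero

   'min' is zero exactly when a != b or a is the null char, so a set
   bit in the VPMOVMSKB mask marks a mismatch or the string end, and
   tzcnt finds the first such position.  */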
81 | |
.section SECTION(.text),"ax",@progbits
83 | ENTRY (STRCMP) |
84 | # ifdef USE_AS_STRNCMP |
85 | /* Check for simple cases (0 or 1) in offset. */ |
86 | cmp $1, %RDX_LP |
87 | je L(char0) |
88 | jb L(zero) |
89 | # ifdef USE_AS_WCSCMP |
90 | /* Convert units: from wide to byte char. */ |
91 | shl $2, %RDX_LP |
92 | # endif |
93 | /* Register %r11 tracks the maximum offset. */ |
94 | mov %RDX_LP, %R11_LP |
95 | # endif |
96 | movl %edi, %eax |
97 | xorl %edx, %edx |
98 | /* Make %xmm7 (%ymm7) all zeros in this function. */ |
99 | vpxor %xmm7, %xmm7, %xmm7 |
100 | orl %esi, %eax |
101 | andl $(PAGE_SIZE - 1), %eax |
102 | cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax |
103 | jg L(cross_page) |
104 | /* Start comparing 4 vectors. */ |
105 | vmovdqu (%rdi), %ymm1 |
106 | VPCMPEQ (%rsi), %ymm1, %ymm0 |
107 | VPMINU %ymm1, %ymm0, %ymm0 |
108 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
109 | vpmovmskb %ymm0, %ecx |
110 | testl %ecx, %ecx |
111 | je L(next_3_vectors) |
112 | tzcntl %ecx, %edx |
113 | # ifdef USE_AS_STRNCMP |
114 | /* Return 0 if the mismatched index (%rdx) is after the maximum |
115 | offset (%r11). */ |
116 | cmpq %r11, %rdx |
117 | jae L(zero) |
118 | # endif |
119 | # ifdef USE_AS_WCSCMP |
120 | xorl %eax, %eax |
121 | movl (%rdi, %rdx), %ecx |
122 | cmpl (%rsi, %rdx), %ecx |
123 | je L(return) |
124 | L(wcscmp_return): |
125 | setl %al |
126 | negl %eax |
127 | orl $1, %eax |
128 | L(return): |
129 | # else |
130 | movzbl (%rdi, %rdx), %eax |
131 | movzbl (%rsi, %rdx), %edx |
132 | subl %edx, %eax |
133 | # endif |
134 | L(return_vzeroupper): |
135 | ZERO_UPPER_VEC_REGISTERS_RETURN |
136 | |
137 | .p2align 4 |
138 | L(return_vec_size): |
139 | tzcntl %ecx, %edx |
140 | # ifdef USE_AS_STRNCMP |
141 | /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after |
142 | the maximum offset (%r11). */ |
143 | addq $VEC_SIZE, %rdx |
144 | cmpq %r11, %rdx |
145 | jae L(zero) |
146 | # ifdef USE_AS_WCSCMP |
147 | xorl %eax, %eax |
148 | movl (%rdi, %rdx), %ecx |
149 | cmpl (%rsi, %rdx), %ecx |
150 | jne L(wcscmp_return) |
151 | # else |
152 | movzbl (%rdi, %rdx), %eax |
153 | movzbl (%rsi, %rdx), %edx |
154 | subl %edx, %eax |
155 | # endif |
156 | # else |
157 | # ifdef USE_AS_WCSCMP |
158 | xorl %eax, %eax |
159 | movl VEC_SIZE(%rdi, %rdx), %ecx |
160 | cmpl VEC_SIZE(%rsi, %rdx), %ecx |
161 | jne L(wcscmp_return) |
162 | # else |
163 | movzbl VEC_SIZE(%rdi, %rdx), %eax |
164 | movzbl VEC_SIZE(%rsi, %rdx), %edx |
165 | subl %edx, %eax |
166 | # endif |
167 | # endif |
168 | VZEROUPPER_RETURN |
169 | |
170 | .p2align 4 |
171 | L(return_2_vec_size): |
172 | tzcntl %ecx, %edx |
173 | # ifdef USE_AS_STRNCMP |
174 | /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is |
175 | after the maximum offset (%r11). */ |
176 | addq $(VEC_SIZE * 2), %rdx |
177 | cmpq %r11, %rdx |
178 | jae L(zero) |
179 | # ifdef USE_AS_WCSCMP |
180 | xorl %eax, %eax |
181 | movl (%rdi, %rdx), %ecx |
182 | cmpl (%rsi, %rdx), %ecx |
183 | jne L(wcscmp_return) |
184 | # else |
185 | movzbl (%rdi, %rdx), %eax |
186 | movzbl (%rsi, %rdx), %edx |
187 | subl %edx, %eax |
188 | # endif |
189 | # else |
190 | # ifdef USE_AS_WCSCMP |
191 | xorl %eax, %eax |
192 | movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx |
193 | cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx |
194 | jne L(wcscmp_return) |
195 | # else |
196 | movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax |
197 | movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx |
198 | subl %edx, %eax |
199 | # endif |
200 | # endif |
201 | VZEROUPPER_RETURN |
202 | |
203 | .p2align 4 |
204 | L(return_3_vec_size): |
205 | tzcntl %ecx, %edx |
206 | # ifdef USE_AS_STRNCMP |
207 | /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is |
208 | after the maximum offset (%r11). */ |
209 | addq $(VEC_SIZE * 3), %rdx |
210 | cmpq %r11, %rdx |
211 | jae L(zero) |
212 | # ifdef USE_AS_WCSCMP |
213 | xorl %eax, %eax |
214 | movl (%rdi, %rdx), %ecx |
215 | cmpl (%rsi, %rdx), %ecx |
216 | jne L(wcscmp_return) |
217 | # else |
218 | movzbl (%rdi, %rdx), %eax |
219 | movzbl (%rsi, %rdx), %edx |
220 | subl %edx, %eax |
221 | # endif |
222 | # else |
223 | # ifdef USE_AS_WCSCMP |
224 | xorl %eax, %eax |
225 | movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx |
226 | cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx |
227 | jne L(wcscmp_return) |
228 | # else |
229 | movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax |
230 | movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx |
231 | subl %edx, %eax |
232 | # endif |
233 | # endif |
234 | VZEROUPPER_RETURN |
235 | |
236 | .p2align 4 |
237 | L(next_3_vectors): |
238 | vmovdqu VEC_SIZE(%rdi), %ymm6 |
239 | VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 |
240 | VPMINU %ymm6, %ymm3, %ymm3 |
241 | VPCMPEQ %ymm7, %ymm3, %ymm3 |
242 | vpmovmskb %ymm3, %ecx |
243 | testl %ecx, %ecx |
244 | jne L(return_vec_size) |
245 | vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 |
246 | vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 |
247 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 |
248 | VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 |
249 | VPMINU %ymm5, %ymm2, %ymm2 |
250 | VPCMPEQ %ymm4, %ymm0, %ymm0 |
251 | VPCMPEQ %ymm7, %ymm2, %ymm2 |
252 | vpmovmskb %ymm2, %ecx |
253 | testl %ecx, %ecx |
254 | jne L(return_2_vec_size) |
255 | VPMINU %ymm4, %ymm0, %ymm0 |
256 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
257 | vpmovmskb %ymm0, %ecx |
258 | testl %ecx, %ecx |
259 | jne L(return_3_vec_size) |
260 | L(main_loop_header): |
261 | leaq (VEC_SIZE * 4)(%rdi), %rdx |
262 | movl $PAGE_SIZE, %ecx |
263 | /* Align load via RAX. */ |
264 | andq $-(VEC_SIZE * 4), %rdx |
265 | subq %rdi, %rdx |
266 | leaq (%rdi, %rdx), %rax |
267 | # ifdef USE_AS_STRNCMP |
268 | /* Starting from this point, the maximum offset, or simply the |
269 | 'offset', DECREASES by the same amount when base pointers are |
270 | moved forward. Return 0 when: |
271 | 1) On match: offset <= the matched vector index. |
2) On mismatch, offset is before the mismatched index.
273 | */ |
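/* For example (illustrative): with 200 bytes left in %r11 and an
   alignment adjustment of 128 in %rdx, 72 bytes remain after the
   subtraction below; if instead %r11 <= %rdx, the limit falls within
   bytes already found equal, so 0 is returned.  */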
274 | subq %rdx, %r11 |
275 | jbe L(zero) |
276 | # endif |
277 | addq %rsi, %rdx |
278 | movq %rdx, %rsi |
279 | andl $(PAGE_SIZE - 1), %esi |
280 | /* Number of bytes before page crossing. */ |
281 | subq %rsi, %rcx |
282 | /* Number of VEC_SIZE * 4 blocks before page crossing. */ |
283 | shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx |
284 | /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ |
285 | movl %ecx, %esi |
286 | jmp L(loop_start) |
287 | |
288 | .p2align 4 |
289 | L(loop): |
290 | # ifdef USE_AS_STRNCMP |
291 | /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease |
292 | the maximum offset (%r11) by the same amount. */ |
293 | subq $(VEC_SIZE * 4), %r11 |
294 | jbe L(zero) |
295 | # endif |
296 | addq $(VEC_SIZE * 4), %rax |
297 | addq $(VEC_SIZE * 4), %rdx |
298 | L(loop_start): |
299 | testl %esi, %esi |
300 | leal -1(%esi), %esi |
301 | je L(loop_cross_page) |
302 | L(back_to_loop): |
/* Main loop, comparing 4 vectors at a time. */
304 | vmovdqa (%rax), %ymm0 |
305 | vmovdqa VEC_SIZE(%rax), %ymm3 |
306 | VPCMPEQ (%rdx), %ymm0, %ymm4 |
307 | VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 |
308 | VPMINU %ymm0, %ymm4, %ymm4 |
309 | VPMINU %ymm3, %ymm1, %ymm1 |
310 | vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 |
311 | VPMINU %ymm1, %ymm4, %ymm0 |
312 | vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 |
313 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 |
314 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 |
315 | VPMINU %ymm2, %ymm5, %ymm5 |
316 | VPMINU %ymm3, %ymm6, %ymm6 |
317 | VPMINU %ymm5, %ymm0, %ymm0 |
318 | VPMINU %ymm6, %ymm0, %ymm0 |
319 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
320 | |
/* Test each mask (32 bits) individually because with VEC_SIZE
   == 32 it is not possible to OR the four masks and keep all bits
   in a 64-bit integer register, differing from SSE2 strcmp
   where ORing is possible. */
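/* Illustratively, four 16-bit SSE2 masks could be packed as
   m0 | m1 << 16 | m2 << 32 | m3 << 48 and scanned with one tzcnt;
   four 32-bit AVX2 masks would need 128 bits, hence the separate
   tests below.  */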
325 | vpmovmskb %ymm0, %ecx |
326 | testl %ecx, %ecx |
327 | je L(loop) |
328 | VPCMPEQ %ymm7, %ymm4, %ymm0 |
329 | vpmovmskb %ymm0, %edi |
330 | testl %edi, %edi |
331 | je L(test_vec) |
332 | tzcntl %edi, %ecx |
333 | # ifdef USE_AS_STRNCMP |
334 | cmpq %rcx, %r11 |
335 | jbe L(zero) |
336 | # ifdef USE_AS_WCSCMP |
337 | movq %rax, %rsi |
338 | xorl %eax, %eax |
339 | movl (%rsi, %rcx), %edi |
340 | cmpl (%rdx, %rcx), %edi |
341 | jne L(wcscmp_return) |
342 | # else |
343 | movzbl (%rax, %rcx), %eax |
344 | movzbl (%rdx, %rcx), %edx |
345 | subl %edx, %eax |
346 | # endif |
347 | # else |
348 | # ifdef USE_AS_WCSCMP |
349 | movq %rax, %rsi |
350 | xorl %eax, %eax |
351 | movl (%rsi, %rcx), %edi |
352 | cmpl (%rdx, %rcx), %edi |
353 | jne L(wcscmp_return) |
354 | # else |
355 | movzbl (%rax, %rcx), %eax |
356 | movzbl (%rdx, %rcx), %edx |
357 | subl %edx, %eax |
358 | # endif |
359 | # endif |
360 | VZEROUPPER_RETURN |
361 | |
362 | .p2align 4 |
363 | L(test_vec): |
364 | # ifdef USE_AS_STRNCMP |
365 | /* The first vector matched. Return 0 if the maximum offset |
366 | (%r11) <= VEC_SIZE. */ |
367 | cmpq $VEC_SIZE, %r11 |
368 | jbe L(zero) |
369 | # endif |
370 | VPCMPEQ %ymm7, %ymm1, %ymm1 |
371 | vpmovmskb %ymm1, %ecx |
372 | testl %ecx, %ecx |
373 | je L(test_2_vec) |
374 | tzcntl %ecx, %edi |
375 | # ifdef USE_AS_STRNCMP |
376 | addq $VEC_SIZE, %rdi |
377 | cmpq %rdi, %r11 |
378 | jbe L(zero) |
379 | # ifdef USE_AS_WCSCMP |
380 | movq %rax, %rsi |
381 | xorl %eax, %eax |
382 | movl (%rsi, %rdi), %ecx |
383 | cmpl (%rdx, %rdi), %ecx |
384 | jne L(wcscmp_return) |
385 | # else |
386 | movzbl (%rax, %rdi), %eax |
387 | movzbl (%rdx, %rdi), %edx |
388 | subl %edx, %eax |
389 | # endif |
390 | # else |
391 | # ifdef USE_AS_WCSCMP |
392 | movq %rax, %rsi |
393 | xorl %eax, %eax |
394 | movl VEC_SIZE(%rsi, %rdi), %ecx |
395 | cmpl VEC_SIZE(%rdx, %rdi), %ecx |
396 | jne L(wcscmp_return) |
397 | # else |
398 | movzbl VEC_SIZE(%rax, %rdi), %eax |
399 | movzbl VEC_SIZE(%rdx, %rdi), %edx |
400 | subl %edx, %eax |
401 | # endif |
402 | # endif |
403 | VZEROUPPER_RETURN |
404 | |
405 | .p2align 4 |
406 | L(test_2_vec): |
407 | # ifdef USE_AS_STRNCMP |
408 | /* The first 2 vectors matched. Return 0 if the maximum offset |
409 | (%r11) <= 2 * VEC_SIZE. */ |
410 | cmpq $(VEC_SIZE * 2), %r11 |
411 | jbe L(zero) |
412 | # endif |
413 | VPCMPEQ %ymm7, %ymm5, %ymm5 |
414 | vpmovmskb %ymm5, %ecx |
415 | testl %ecx, %ecx |
416 | je L(test_3_vec) |
417 | tzcntl %ecx, %edi |
418 | # ifdef USE_AS_STRNCMP |
419 | addq $(VEC_SIZE * 2), %rdi |
420 | cmpq %rdi, %r11 |
421 | jbe L(zero) |
422 | # ifdef USE_AS_WCSCMP |
423 | movq %rax, %rsi |
424 | xorl %eax, %eax |
425 | movl (%rsi, %rdi), %ecx |
426 | cmpl (%rdx, %rdi), %ecx |
427 | jne L(wcscmp_return) |
428 | # else |
429 | movzbl (%rax, %rdi), %eax |
430 | movzbl (%rdx, %rdi), %edx |
431 | subl %edx, %eax |
432 | # endif |
433 | # else |
434 | # ifdef USE_AS_WCSCMP |
435 | movq %rax, %rsi |
436 | xorl %eax, %eax |
437 | movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx |
438 | cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx |
439 | jne L(wcscmp_return) |
440 | # else |
441 | movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax |
442 | movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx |
443 | subl %edx, %eax |
444 | # endif |
445 | # endif |
446 | VZEROUPPER_RETURN |
447 | |
448 | .p2align 4 |
449 | L(test_3_vec): |
450 | # ifdef USE_AS_STRNCMP |
451 | /* The first 3 vectors matched. Return 0 if the maximum offset |
452 | (%r11) <= 3 * VEC_SIZE. */ |
453 | cmpq $(VEC_SIZE * 3), %r11 |
454 | jbe L(zero) |
455 | # endif |
456 | VPCMPEQ %ymm7, %ymm6, %ymm6 |
457 | vpmovmskb %ymm6, %esi |
458 | tzcntl %esi, %ecx |
459 | # ifdef USE_AS_STRNCMP |
460 | addq $(VEC_SIZE * 3), %rcx |
461 | cmpq %rcx, %r11 |
462 | jbe L(zero) |
463 | # ifdef USE_AS_WCSCMP |
464 | movq %rax, %rsi |
465 | xorl %eax, %eax |
466 | movl (%rsi, %rcx), %esi |
467 | cmpl (%rdx, %rcx), %esi |
468 | jne L(wcscmp_return) |
469 | # else |
470 | movzbl (%rax, %rcx), %eax |
471 | movzbl (%rdx, %rcx), %edx |
472 | subl %edx, %eax |
473 | # endif |
474 | # else |
475 | # ifdef USE_AS_WCSCMP |
476 | movq %rax, %rsi |
477 | xorl %eax, %eax |
478 | movl (VEC_SIZE * 3)(%rsi, %rcx), %esi |
479 | cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi |
480 | jne L(wcscmp_return) |
481 | # else |
482 | movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax |
483 | movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx |
484 | subl %edx, %eax |
485 | # endif |
486 | # endif |
487 | VZEROUPPER_RETURN |
488 | |
489 | .p2align 4 |
490 | L(loop_cross_page): |
491 | xorl %r10d, %r10d |
492 | movq %rdx, %rcx |
493 | /* Align load via RDX. We load the extra ECX bytes which should |
494 | be ignored. */ |
495 | andl $((VEC_SIZE * 4) - 1), %ecx |
496 | /* R10 is -RCX. */ |
497 | subq %rcx, %r10 |
498 | |
499 | /* This works only if VEC_SIZE * 2 == 64. */ |
500 | # if (VEC_SIZE * 2) != 64 |
501 | # error (VEC_SIZE * 2) != 64 |
502 | # endif |
503 | |
504 | /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ |
505 | cmpl $(VEC_SIZE * 2), %ecx |
506 | jge L(loop_cross_page_2_vec) |
507 | |
508 | vmovdqu (%rax, %r10), %ymm2 |
509 | vmovdqu VEC_SIZE(%rax, %r10), %ymm3 |
510 | VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 |
511 | VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 |
512 | VPMINU %ymm2, %ymm0, %ymm0 |
513 | VPMINU %ymm3, %ymm1, %ymm1 |
514 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
515 | VPCMPEQ %ymm7, %ymm1, %ymm1 |
516 | |
517 | vpmovmskb %ymm0, %edi |
518 | vpmovmskb %ymm1, %esi |
519 | |
520 | salq $32, %rsi |
521 | xorq %rsi, %rdi |
522 | |
523 | /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ |
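/* For example (illustrative): if ECX == 10, the low 10 bits of the
   combined 64-bit mask correspond to the 10 bytes loaded before the
   current position and are shifted out below.  */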
524 | shrq %cl, %rdi |
525 | |
526 | testq %rdi, %rdi |
527 | je L(loop_cross_page_2_vec) |
528 | tzcntq %rdi, %rcx |
529 | # ifdef USE_AS_STRNCMP |
530 | cmpq %rcx, %r11 |
531 | jbe L(zero) |
532 | # ifdef USE_AS_WCSCMP |
533 | movq %rax, %rsi |
534 | xorl %eax, %eax |
535 | movl (%rsi, %rcx), %edi |
536 | cmpl (%rdx, %rcx), %edi |
537 | jne L(wcscmp_return) |
538 | # else |
539 | movzbl (%rax, %rcx), %eax |
540 | movzbl (%rdx, %rcx), %edx |
541 | subl %edx, %eax |
542 | # endif |
543 | # else |
544 | # ifdef USE_AS_WCSCMP |
545 | movq %rax, %rsi |
546 | xorl %eax, %eax |
547 | movl (%rsi, %rcx), %edi |
548 | cmpl (%rdx, %rcx), %edi |
549 | jne L(wcscmp_return) |
550 | # else |
551 | movzbl (%rax, %rcx), %eax |
552 | movzbl (%rdx, %rcx), %edx |
553 | subl %edx, %eax |
554 | # endif |
555 | # endif |
556 | VZEROUPPER_RETURN |
557 | |
558 | .p2align 4 |
559 | L(loop_cross_page_2_vec): |
560 | /* The first VEC_SIZE * 2 bytes match or are ignored. */ |
561 | vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 |
562 | vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 |
563 | VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 |
564 | VPMINU %ymm2, %ymm5, %ymm5 |
565 | VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 |
566 | VPCMPEQ %ymm7, %ymm5, %ymm5 |
567 | VPMINU %ymm3, %ymm6, %ymm6 |
568 | VPCMPEQ %ymm7, %ymm6, %ymm6 |
569 | |
570 | vpmovmskb %ymm5, %edi |
571 | vpmovmskb %ymm6, %esi |
572 | |
573 | salq $32, %rsi |
574 | xorq %rsi, %rdi |
575 | |
576 | xorl %r8d, %r8d |
577 | /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ |
578 | subl $(VEC_SIZE * 2), %ecx |
579 | jle 1f |
580 | /* Skip ECX bytes. */ |
581 | shrq %cl, %rdi |
582 | /* R8 has number of bytes skipped. */ |
583 | movl %ecx, %r8d |
584 | 1: |
585 | /* Before jumping back to the loop, set ESI to the number of |
586 | VEC_SIZE * 4 blocks before page crossing. */ |
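/* With 4 KiB pages and 32-byte vectors that count is
   4096 / 128 - 1 == 31.  */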
587 | movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi |
588 | |
589 | testq %rdi, %rdi |
590 | # ifdef USE_AS_STRNCMP |
/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
   starting from %rax (%r10 is negative here) have already been
   tested without finding a mismatch or null char.  The label below
   checks whether the strncmp maximum offset has been reached. */
594 | je L(string_nbyte_offset_check) |
595 | # else |
596 | je L(back_to_loop) |
597 | # endif |
598 | tzcntq %rdi, %rcx |
599 | addq %r10, %rcx |
600 | /* Adjust for number of bytes skipped. */ |
601 | addq %r8, %rcx |
602 | # ifdef USE_AS_STRNCMP |
603 | addq $(VEC_SIZE * 2), %rcx |
604 | subq %rcx, %r11 |
605 | jbe L(zero) |
606 | # ifdef USE_AS_WCSCMP |
607 | movq %rax, %rsi |
608 | xorl %eax, %eax |
609 | movl (%rsi, %rcx), %edi |
610 | cmpl (%rdx, %rcx), %edi |
611 | jne L(wcscmp_return) |
612 | # else |
613 | movzbl (%rax, %rcx), %eax |
614 | movzbl (%rdx, %rcx), %edx |
615 | subl %edx, %eax |
616 | # endif |
617 | # else |
618 | # ifdef USE_AS_WCSCMP |
619 | movq %rax, %rsi |
620 | xorl %eax, %eax |
621 | movl (VEC_SIZE * 2)(%rsi, %rcx), %edi |
622 | cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi |
623 | jne L(wcscmp_return) |
624 | # else |
625 | movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax |
626 | movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx |
627 | subl %edx, %eax |
628 | # endif |
629 | # endif |
630 | VZEROUPPER_RETURN |
631 | |
632 | # ifdef USE_AS_STRNCMP |
633 | L(string_nbyte_offset_check): |
634 | leaq (VEC_SIZE * 4)(%r10), %r10 |
635 | cmpq %r10, %r11 |
636 | jbe L(zero) |
637 | jmp L(back_to_loop) |
638 | # endif |
639 | |
640 | .p2align 4 |
641 | L(cross_page_loop): |
642 | /* Check one byte/dword at a time. */ |
643 | # ifdef USE_AS_WCSCMP |
644 | cmpl %ecx, %eax |
645 | # else |
646 | subl %ecx, %eax |
647 | # endif |
648 | jne L(different) |
649 | addl $SIZE_OF_CHAR, %edx |
650 | cmpl $(VEC_SIZE * 4), %edx |
651 | je L(main_loop_header) |
652 | # ifdef USE_AS_STRNCMP |
653 | cmpq %r11, %rdx |
654 | jae L(zero) |
655 | # endif |
656 | # ifdef USE_AS_WCSCMP |
657 | movl (%rdi, %rdx), %eax |
658 | movl (%rsi, %rdx), %ecx |
659 | # else |
660 | movzbl (%rdi, %rdx), %eax |
661 | movzbl (%rsi, %rdx), %ecx |
662 | # endif |
663 | /* Check null char. */ |
664 | testl %eax, %eax |
665 | jne L(cross_page_loop) |
666 | /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED |
667 | comparisons. */ |
668 | subl %ecx, %eax |
669 | # ifndef USE_AS_WCSCMP |
670 | L(different): |
671 | # endif |
672 | VZEROUPPER_RETURN |
673 | |
674 | # ifdef USE_AS_WCSCMP |
675 | .p2align 4 |
676 | L(different): |
677 | /* Use movl to avoid modifying EFLAGS. */ |
678 | movl $0, %eax |
679 | setl %al |
680 | negl %eax |
681 | orl $1, %eax |
682 | VZEROUPPER_RETURN |
683 | # endif |
684 | |
685 | # ifdef USE_AS_STRNCMP |
686 | .p2align 4 |
687 | L(zero): |
688 | xorl %eax, %eax |
689 | VZEROUPPER_RETURN |
690 | |
691 | .p2align 4 |
692 | L(char0): |
693 | # ifdef USE_AS_WCSCMP |
694 | xorl %eax, %eax |
695 | movl (%rdi), %ecx |
696 | cmpl (%rsi), %ecx |
697 | jne L(wcscmp_return) |
698 | # else |
699 | movzbl (%rsi), %ecx |
700 | movzbl (%rdi), %eax |
701 | subl %ecx, %eax |
702 | # endif |
703 | VZEROUPPER_RETURN |
704 | # endif |
705 | |
706 | .p2align 4 |
707 | L(last_vector): |
708 | addq %rdx, %rdi |
709 | addq %rdx, %rsi |
710 | # ifdef USE_AS_STRNCMP |
711 | subq %rdx, %r11 |
712 | # endif |
713 | tzcntl %ecx, %edx |
714 | # ifdef USE_AS_STRNCMP |
715 | cmpq %r11, %rdx |
716 | jae L(zero) |
717 | # endif |
718 | # ifdef USE_AS_WCSCMP |
719 | xorl %eax, %eax |
720 | movl (%rdi, %rdx), %ecx |
721 | cmpl (%rsi, %rdx), %ecx |
722 | jne L(wcscmp_return) |
723 | # else |
724 | movzbl (%rdi, %rdx), %eax |
725 | movzbl (%rsi, %rdx), %edx |
726 | subl %edx, %eax |
727 | # endif |
728 | VZEROUPPER_RETURN |
729 | |
/* Comparing in the page boundary region requires special treatment:
   it must be done one vector at a time, starting with the wider ymm
   vector if possible, and with xmm otherwise.  If a 16-byte (xmm)
   fetch still crosses the boundary, progressively smaller loads and
   finally a byte/dword-at-a-time comparison are used.  */
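
/* An illustrative (not literal) C sketch of the ladder below, where
   'off' is the page offset bound kept in %eax:

     while (off <= PAGE_SIZE - 32) { compare 32 bytes; off += 32; }   (ymm)
     if (off <= PAGE_SIZE - 16)    { compare 16 bytes; off += 16; }   (xmm)
     if (off <= PAGE_SIZE - 8)     { compare 8 bytes;  off += 8;  }   (strcmp only)
     if (off <= PAGE_SIZE - 4)     { compare 4 bytes; }               (strcmp only)
     then compare one byte (strcmp) or dword (wcscmp) at a time.  */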
735 | .p2align 4 |
736 | L(cross_page): |
737 | /* Try one ymm vector at a time. */ |
738 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
739 | jg L(cross_page_1_vector) |
740 | L(loop_1_vector): |
741 | vmovdqu (%rdi, %rdx), %ymm1 |
742 | VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 |
743 | VPMINU %ymm1, %ymm0, %ymm0 |
744 | VPCMPEQ %ymm7, %ymm0, %ymm0 |
745 | vpmovmskb %ymm0, %ecx |
746 | testl %ecx, %ecx |
747 | jne L(last_vector) |
748 | |
749 | addl $VEC_SIZE, %edx |
750 | |
751 | addl $VEC_SIZE, %eax |
752 | # ifdef USE_AS_STRNCMP |
753 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
754 | (%r11). */ |
755 | cmpq %r11, %rdx |
756 | jae L(zero) |
757 | # endif |
758 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
759 | jle L(loop_1_vector) |
760 | L(cross_page_1_vector): |
761 | /* Less than 32 bytes to check, try one xmm vector. */ |
762 | cmpl $(PAGE_SIZE - 16), %eax |
763 | jg L(cross_page_1_xmm) |
764 | vmovdqu (%rdi, %rdx), %xmm1 |
765 | VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 |
766 | VPMINU %xmm1, %xmm0, %xmm0 |
767 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
768 | vpmovmskb %xmm0, %ecx |
769 | testl %ecx, %ecx |
770 | jne L(last_vector) |
771 | |
772 | addl $16, %edx |
773 | # ifndef USE_AS_WCSCMP |
774 | addl $16, %eax |
775 | # endif |
776 | # ifdef USE_AS_STRNCMP |
777 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
778 | (%r11). */ |
779 | cmpq %r11, %rdx |
780 | jae L(zero) |
781 | # endif |
782 | |
783 | L(cross_page_1_xmm): |
784 | # ifndef USE_AS_WCSCMP |
/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not needed
   for wcscmp/wcsncmp since a wide char is 4 bytes. */
787 | cmpl $(PAGE_SIZE - 8), %eax |
788 | jg L(cross_page_8bytes) |
789 | vmovq (%rdi, %rdx), %xmm1 |
790 | vmovq (%rsi, %rdx), %xmm0 |
791 | VPCMPEQ %xmm0, %xmm1, %xmm0 |
792 | VPMINU %xmm1, %xmm0, %xmm0 |
793 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
794 | vpmovmskb %xmm0, %ecx |
795 | /* Only last 8 bits are valid. */ |
796 | andl $0xff, %ecx |
797 | testl %ecx, %ecx |
798 | jne L(last_vector) |
799 | |
800 | addl $8, %edx |
801 | addl $8, %eax |
802 | # ifdef USE_AS_STRNCMP |
803 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
804 | (%r11). */ |
805 | cmpq %r11, %rdx |
806 | jae L(zero) |
807 | # endif |
808 | |
809 | L(cross_page_8bytes): |
810 | /* Less than 8 bytes to check, try 4 byte vector. */ |
811 | cmpl $(PAGE_SIZE - 4), %eax |
812 | jg L(cross_page_4bytes) |
813 | vmovd (%rdi, %rdx), %xmm1 |
814 | vmovd (%rsi, %rdx), %xmm0 |
815 | VPCMPEQ %xmm0, %xmm1, %xmm0 |
816 | VPMINU %xmm1, %xmm0, %xmm0 |
817 | VPCMPEQ %xmm7, %xmm0, %xmm0 |
818 | vpmovmskb %xmm0, %ecx |
819 | /* Only last 4 bits are valid. */ |
820 | andl $0xf, %ecx |
821 | testl %ecx, %ecx |
822 | jne L(last_vector) |
823 | |
824 | addl $4, %edx |
825 | # ifdef USE_AS_STRNCMP |
826 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
827 | (%r11). */ |
828 | cmpq %r11, %rdx |
829 | jae L(zero) |
830 | # endif |
831 | |
832 | L(cross_page_4bytes): |
833 | # endif |
834 | /* Less than 4 bytes to check, try one byte/dword at a time. */ |
835 | # ifdef USE_AS_STRNCMP |
836 | cmpq %r11, %rdx |
837 | jae L(zero) |
838 | # endif |
839 | # ifdef USE_AS_WCSCMP |
840 | movl (%rdi, %rdx), %eax |
841 | movl (%rsi, %rdx), %ecx |
842 | # else |
843 | movzbl (%rdi, %rdx), %eax |
844 | movzbl (%rsi, %rdx), %ecx |
845 | # endif |
846 | testl %eax, %eax |
847 | jne L(cross_page_loop) |
848 | subl %ecx, %eax |
849 | VZEROUPPER_RETURN |
850 | END (STRCMP) |
851 | #endif |
852 | |