/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_evex
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define VPTESTM	vptestmd
#  define SHIFT_REG32	r8d
#  define SHIFT_REG64	r8
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define VPTESTM	vptestmb
#  define SHIFT_REG32	ecx
#  define SHIFT_REG64	rcx
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# define XMMZERO	xmm16
# define XMM0		xmm17
# define XMM1		xmm18

# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18
# define YMM2		ymm19
# define YMM3		ymm20
# define YMM4		ymm21
# define YMM5		ymm22
# define YMM6		ymm23
# define YMM7		ymm24
# define YMM8		ymm25
# define YMM9		ymm26
# define YMM10		ymm27

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/
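
/* A hedged illustration of the signedness rule above, assuming
   glibc's 32-bit signed wchar_t (C sketch, not part of the code):

     strcmp:  (unsigned char) 0xff  >  (unsigned char) 0x01   -> > 0
     wcscmp:  (wchar_t) 0xffffffff  <  (wchar_t) 0x00000001   -> < 0

   This is why the wcscmp return paths below derive the sign with a
   signed setl rather than an unsigned subtraction.  */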

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   a mask of the non-null CHARs (VPTESTM) and uses it as a write mask
   for the comparison, so checking VEC_SIZE bytes (32 bytes) costs one
   VPTESTM and one masked VPCMP, together with VMOVU and kmovd
   instructions.  The main loop (away from the page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */
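
/* A rough C sketch of one 32-byte vector check (byte flavor;
   illustrative only, variable names are hypothetical):

     uint32_t nonnull = 0, ok = 0;
     for (int i = 0; i < 32; i++) {
       if (s1[i] != 0)
         nonnull |= 1u << i;                       // VPTESTM -> k2
       if ((nonnull >> i & 1) && s1[i] == s2[i])
         ok |= 1u << i;                            // VPCMP {%k2} -> k1
     }
     // ok + 1 overflows to 0 iff all 32 CHARs match and none is null;
     // otherwise tzcnt (ok + 1) is the index of the first mismatch or
     // null, since all bits below the first cleared bit are set.  The
     // dword flavor uses an 8-bit mask and ok - 0xff instead.  */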

	.section .text.evex,"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
#   ifndef __ILP32__
	movq	%rdx, %rcx
	/* Check if the length could overflow when multiplied by
	   sizeof(wchar_t).  Checking the top 8 bits covers all potential
	   overflow cases as well as redirects cases where it is
	   impossible for the length to bound a valid memory region.  In
	   these cases just use 'wcscmp'.  */
	shrq	$56, %rcx
	jnz	__wcscmp_evex
#   endif
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
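	/* A minimal C sketch of the guard and conversion above for
	   wcsncmp (illustrative only):

	     if (n >> 56)              // n * 4 could overflow, or cannot
	       return wcscmp (a, b);   //   bound a valid memory region
	     n *= sizeof (wchar_t);    // track the bound in bytes
	*/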
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	VMOVU	(%rdi), %YMM0

	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2

	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}

	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(next_3_vectors):
	VMOVU	VEC_SIZE(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_vec_size)

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_2_vec_size)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl	%ecx, %esi
	jmp	L(loop_start)

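/* Rough C equivalent of one main-loop iteration below (byte flavor;
   illustrative only).  The unsigned min of all four vectors has a zero
   lane exactly where some vector has a null CHAR, so a single masked
   compare detects both mismatches and terminators:

     v[0..3] = 128 aligned bytes of s1;                // 4x VMOVA
     m       = min (min (v0, v1), min (v2, v3));       // VPMINU x3
     nonnull = lanes where m != 0;                     // VPTESTM -> k1
     d       = (v0^w0) | (v1^w1) | (v2^w2) | (v3^w3);  // w* from s2
     ok      = lanes where d == 0, masked by nonnull;  // VPCMP -> k0
     if (ok != all-ones)                               // incl/subl
       locate the first offending vector and lane.  */
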
	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	VMOVA	(%rax), %YMM0
	VMOVA	VEC_SIZE(%rax), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM8

	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
	VPTESTM	%YMM8, %YMM8, %k1

	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
	vpxorq	(%rdx), %YMM0, %YMM1
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7

	vporq	%YMM1, %YMM3, %YMM9
	vporq	%YMM5, %YMM7, %YMM10

	/* A non-zero CHAR in YMM9 represents a mismatch.  */
	vporq	%YMM9, %YMM10, %YMM9

	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(loop)

	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM0 and (%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_vec)
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM2 and VEC_SIZE(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
	VPTESTM	%YMM4, %YMM4, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
	VPTESTM	%YMM6, %YMM6, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif
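
	/* Sketch of the mask handling used below (byte flavor;
	   illustrative only).  Two 32-bit lane masks are inverted,
	   merged into one 64-bit mask, and the lanes for the ECX
	   ignored bytes are shifted out:

	     uint64_t bad = (uint32_t) ~m0
			    | ((uint64_t) (uint32_t) ~m1 << 32);
	     bad >>= skip;        // skip = ECX (or ECX / 4 for dwords)
	     if (bad)
	       idx = tzcnt (bad); // first mismatch or null
	*/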

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	VMOVU	(%rax, %r10), %YMM2
	VMOVU	VEC_SIZE(%rax, %r10), %YMM3

	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM2 and 32 bytes at (%rdx, %r10).  */
	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since R9D becomes the lower 8 (wcscmp) or 32
	   bits of RDI below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
	VPTESTM	%YMM3, %YMM3, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here: the lower bits correspond to CHAR
	   comparisons that are potentially out of range and thus can
	   be 0 without indicating a mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents a 4-byte element.  */
	sall	$8, %edi
	/* NB: Divide the shift count by 4 since each bit in K1
	   represents 4 bytes.  */
	movl	%ecx, %SHIFT_REG32
	sarl	$2, %SHIFT_REG32

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrxq	%SHIFT_REG64, %rdi, %rdi
	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since R9D becomes the lower 8 (wcscmp) or 32
	   bits of RDI below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	VPTESTM	%YMM1, %YMM1, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here: the lower bits correspond to CHAR
	   comparisons that are potentially out of range and thus can
	   be 0 without indicating a mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents a 4-byte element.  */
	sall	$8, %edi

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* R8 has the number of bytes skipped.  */
	movl	%ecx, %r8d
# ifdef USE_AS_WCSCMP
	/* NB: Divide the shift count by 4 since each bit in RDI
	   represents 4 bytes.  */
	sarl	$2, %ecx
	/* Skip ECX bytes.  */
	shrl	%cl, %edi
# else
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
# endif
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 bytes starting
	   at %rax + %r10 have already been tested.  The label below
	   checks whether the strncmp maximum offset has been
	   reached.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

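/* C sketch of the byte-at-a-time fallback below (byte flavor;
   SIZE_OF_CHAR is 4 and the compare is signed for the dword flavor;
   illustrative only):

     while (a[i] == b[i]) {
       i += SIZE_OF_CHAR;
       if (i == VEC_SIZE * 4)
         goto main_loop_header;    // page crossed; go vectorize
       if (strncmp_flavor && i >= n)
         return 0;
       if (a[i] == 0)
         return a[i] - b[i];       // 0 - b[i] at the terminator
     }
     return a[i] - b[i];
*/
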
	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null CHAR.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	ret

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	ret
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

/* Comparing on a page boundary region requires special treatment:
   it must be done one vector at a time, starting with the wider ymm
   vector if possible, and with xmm otherwise.  If fetching 16 bytes
   (xmm) would still cross the boundary, byte-wise comparison must be
   done.  */
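
/* Illustrative summary of the check ladder below, in terms of the
   offset within the 4096-byte page (byte flavor; the 8- and 4-byte
   steps are skipped for the dword flavor):

     off <= PAGE_SIZE - 32  -> compare one ymm vector (32 bytes)
     off <= PAGE_SIZE - 16  -> compare one xmm vector (16 bytes)
     off <= PAGE_SIZE - 8   -> compare 8 bytes
     off <= PAGE_SIZE - 4   -> compare 4 bytes
     otherwise              -> compare one byte/dword at a time  */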
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	VMOVU	(%rdi, %rdx), %YMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	VMOVU	(%rdi, %rdx), %XMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	subl	$0xffff, %ecx
# endif
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not
	   needed for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %XMM0
	vmovq	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovb	%k1, %ecx
#  ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
#  else
	subl	$0xff, %ecx
#  endif
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try a 4-byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %XMM0
	vmovd	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
#  ifdef USE_AS_WCSCMP
	subl	$0x1, %ecx
#  else
	subl	$0xf, %ecx
#  endif
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	ret
END (STRCMP)
#endif