1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. |
2 | Copyright (C) 2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef STRCMP |
24 | # define STRCMP __strcmp_evex |
25 | # endif |
26 | |
27 | # define PAGE_SIZE 4096 |
28 | |
/* VEC_SIZE = Number of bytes in a ymm register.  */
30 | # define VEC_SIZE 32 |
31 | |
32 | /* Shift for dividing by (VEC_SIZE * 4). */ |
33 | # define DIVIDE_BY_VEC_4_SHIFT 7 |
34 | # if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
35 | # error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
36 | # endif |
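
/* For illustration: with VEC_SIZE == 32, VEC_SIZE * 4 == 128 == 1 << 7,
   so shifting a byte count right by DIVIDE_BY_VEC_4_SHIFT divides it by
   128, i.e. yields the number of whole VEC_SIZE * 4 blocks it contains.  */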
37 | |
38 | # define VMOVU vmovdqu64 |
39 | # define VMOVA vmovdqa64 |
40 | |
41 | # ifdef USE_AS_WCSCMP |
42 | /* Compare packed dwords. */ |
43 | # define VPCMP vpcmpd |
44 | # define SHIFT_REG32 r8d |
45 | # define SHIFT_REG64 r8 |
46 | /* 1 dword char == 4 bytes. */ |
47 | # define SIZE_OF_CHAR 4 |
48 | # else |
49 | /* Compare packed bytes. */ |
50 | # define VPCMP vpcmpb |
51 | # define SHIFT_REG32 ecx |
52 | # define SHIFT_REG64 rcx |
53 | /* 1 byte char == 1 byte. */ |
54 | # define SIZE_OF_CHAR 1 |
55 | # endif |
56 | |
57 | # define XMMZERO xmm16 |
58 | # define XMM0 xmm17 |
59 | # define XMM1 xmm18 |
60 | |
61 | # define YMMZERO ymm16 |
62 | # define YMM0 ymm17 |
63 | # define YMM1 ymm18 |
64 | # define YMM2 ymm19 |
65 | # define YMM3 ymm20 |
66 | # define YMM4 ymm21 |
67 | # define YMM5 ymm22 |
68 | # define YMM6 ymm23 |
69 | # define YMM7 ymm24 |
70 | |
71 | /* Warning! |
72 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
73 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
74 | */ |
75 | |
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   comparison operates on either packed bytes or packed dwords depending
   on USE_AS_WCSCMP.  In order to also check for the null char, the
   algorithm combines the null-compare masks with the mismatch mask,
   requiring 5 EVEX instructions (3 VPCMP and 2 KORD) per vector pair.
   In general, the cost of comparing VEC_SIZE bytes (32 bytes) is 3
   VPCMP and 2 KORD instructions, together with VMOVU and ktestd
   instructions.  The main loop (away from the page boundary) compares
   4 vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
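
/* For illustration only (not part of the build): a rough scalar C
   sketch of one vector-pair check described above, assuming 8-bit
   elements and VEC_SIZE == 32.  The real code builds the combined
   "mismatch or null" mask with VPCMP/KORD and finds the first set bit
   with tzcnt; first_interesting () is purely a hypothetical helper.

     static int
     first_interesting (const unsigned char *s1, const unsigned char *s2)
     {
       unsigned int mask = 0;
       for (int i = 0; i < 32; i++)
         if (s1[i] != s2[i] || s1[i] == 0 || s2[i] == 0)
           mask |= 1u << i;           // VPCMP $4 / VPCMP $0 + KORD
       if (mask == 0)                 // ktestd == 0: advance to next vec
         return 32;
       return __builtin_ctz (mask);   // tzcnt
     }

     // At the returned index i (if < 32), the result is
     // (int) s1[i] - (int) s2[i]; 0 there means both strings ended.
   */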
91 | |
	.section .text.evex,"ax",@progbits
93 | ENTRY (STRCMP) |
94 | # ifdef USE_AS_STRNCMP |
95 | /* Check for simple cases (0 or 1) in offset. */ |
96 | cmp $1, %RDX_LP |
97 | je L(char0) |
98 | jb L(zero) |
99 | # ifdef USE_AS_WCSCMP |
100 | /* Convert units: from wide to byte char. */ |
101 | shl $2, %RDX_LP |
102 | # endif |
103 | /* Register %r11 tracks the maximum offset. */ |
104 | mov %RDX_LP, %R11_LP |
105 | # endif |
106 | movl %edi, %eax |
107 | xorl %edx, %edx |
108 | /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ |
109 | vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
110 | orl %esi, %eax |
111 | andl $(PAGE_SIZE - 1), %eax |
112 | cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax |
113 | jg L(cross_page) |
114 | /* Start comparing 4 vectors. */ |
115 | VMOVU (%rdi), %YMM0 |
116 | VMOVU (%rsi), %YMM1 |
117 | |
118 | /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
119 | VPCMP $4, %YMM0, %YMM1, %k0 |
120 | |
121 | /* Check for NULL in YMM0. */ |
122 | VPCMP $0, %YMMZERO, %YMM0, %k1 |
123 | /* Check for NULL in YMM1. */ |
124 | VPCMP $0, %YMMZERO, %YMM1, %k2 |
125 | /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
126 | kord %k1, %k2, %k1 |
127 | |
128 | /* Each bit in K1 represents: |
129 | 1. A mismatch in YMM0 and YMM1. Or |
130 | 2. A NULL in YMM0 or YMM1. |
131 | */ |
132 | kord %k0, %k1, %k1 |
133 | |
134 | ktestd %k1, %k1 |
135 | je L(next_3_vectors) |
136 | kmovd %k1, %ecx |
137 | tzcntl %ecx, %edx |
138 | # ifdef USE_AS_WCSCMP |
139 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
140 | sall $2, %edx |
141 | # endif |
142 | # ifdef USE_AS_STRNCMP |
143 | /* Return 0 if the mismatched index (%rdx) is after the maximum |
144 | offset (%r11). */ |
145 | cmpq %r11, %rdx |
146 | jae L(zero) |
147 | # endif |
148 | # ifdef USE_AS_WCSCMP |
149 | xorl %eax, %eax |
150 | movl (%rdi, %rdx), %ecx |
151 | cmpl (%rsi, %rdx), %ecx |
152 | je L(return) |
153 | L(wcscmp_return): |
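	/* Map the signed dword comparison done just before the jump
	   here to -1/+1: %eax was zeroed beforehand, setl/negl yield
	   -1 if the first string's element is less, otherwise 0, and
	   "orl $1" turns the 0 case into +1 while leaving -1
	   unchanged.  */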
154 | setl %al |
155 | negl %eax |
156 | orl $1, %eax |
157 | L(return): |
158 | # else |
159 | movzbl (%rdi, %rdx), %eax |
160 | movzbl (%rsi, %rdx), %edx |
161 | subl %edx, %eax |
162 | # endif |
163 | ret |
164 | |
165 | .p2align 4 |
166 | L(return_vec_size): |
167 | kmovd %k1, %ecx |
168 | tzcntl %ecx, %edx |
169 | # ifdef USE_AS_WCSCMP |
170 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
171 | sall $2, %edx |
172 | # endif |
173 | # ifdef USE_AS_STRNCMP |
174 | /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after |
175 | the maximum offset (%r11). */ |
176 | addq $VEC_SIZE, %rdx |
177 | cmpq %r11, %rdx |
178 | jae L(zero) |
179 | # ifdef USE_AS_WCSCMP |
180 | xorl %eax, %eax |
181 | movl (%rdi, %rdx), %ecx |
182 | cmpl (%rsi, %rdx), %ecx |
183 | jne L(wcscmp_return) |
184 | # else |
185 | movzbl (%rdi, %rdx), %eax |
186 | movzbl (%rsi, %rdx), %edx |
187 | subl %edx, %eax |
188 | # endif |
189 | # else |
190 | # ifdef USE_AS_WCSCMP |
191 | xorl %eax, %eax |
192 | movl VEC_SIZE(%rdi, %rdx), %ecx |
193 | cmpl VEC_SIZE(%rsi, %rdx), %ecx |
194 | jne L(wcscmp_return) |
195 | # else |
196 | movzbl VEC_SIZE(%rdi, %rdx), %eax |
197 | movzbl VEC_SIZE(%rsi, %rdx), %edx |
198 | subl %edx, %eax |
199 | # endif |
200 | # endif |
201 | ret |
202 | |
203 | .p2align 4 |
204 | L(return_2_vec_size): |
205 | kmovd %k1, %ecx |
206 | tzcntl %ecx, %edx |
207 | # ifdef USE_AS_WCSCMP |
208 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
209 | sall $2, %edx |
210 | # endif |
211 | # ifdef USE_AS_STRNCMP |
212 | /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is |
213 | after the maximum offset (%r11). */ |
214 | addq $(VEC_SIZE * 2), %rdx |
215 | cmpq %r11, %rdx |
216 | jae L(zero) |
217 | # ifdef USE_AS_WCSCMP |
218 | xorl %eax, %eax |
219 | movl (%rdi, %rdx), %ecx |
220 | cmpl (%rsi, %rdx), %ecx |
221 | jne L(wcscmp_return) |
222 | # else |
223 | movzbl (%rdi, %rdx), %eax |
224 | movzbl (%rsi, %rdx), %edx |
225 | subl %edx, %eax |
226 | # endif |
227 | # else |
228 | # ifdef USE_AS_WCSCMP |
229 | xorl %eax, %eax |
230 | movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx |
231 | cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx |
232 | jne L(wcscmp_return) |
233 | # else |
234 | movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax |
235 | movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx |
236 | subl %edx, %eax |
237 | # endif |
238 | # endif |
239 | ret |
240 | |
241 | .p2align 4 |
242 | L(return_3_vec_size): |
243 | kmovd %k1, %ecx |
244 | tzcntl %ecx, %edx |
245 | # ifdef USE_AS_WCSCMP |
246 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
247 | sall $2, %edx |
248 | # endif |
249 | # ifdef USE_AS_STRNCMP |
250 | /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is |
251 | after the maximum offset (%r11). */ |
252 | addq $(VEC_SIZE * 3), %rdx |
253 | cmpq %r11, %rdx |
254 | jae L(zero) |
255 | # ifdef USE_AS_WCSCMP |
256 | xorl %eax, %eax |
257 | movl (%rdi, %rdx), %ecx |
258 | cmpl (%rsi, %rdx), %ecx |
259 | jne L(wcscmp_return) |
260 | # else |
261 | movzbl (%rdi, %rdx), %eax |
262 | movzbl (%rsi, %rdx), %edx |
263 | subl %edx, %eax |
264 | # endif |
265 | # else |
266 | # ifdef USE_AS_WCSCMP |
267 | xorl %eax, %eax |
268 | movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx |
269 | cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx |
270 | jne L(wcscmp_return) |
271 | # else |
272 | movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax |
273 | movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx |
274 | subl %edx, %eax |
275 | # endif |
276 | # endif |
277 | ret |
278 | |
279 | .p2align 4 |
280 | L(next_3_vectors): |
281 | VMOVU VEC_SIZE(%rdi), %YMM0 |
282 | VMOVU VEC_SIZE(%rsi), %YMM1 |
283 | /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
284 | VPCMP $4, %YMM0, %YMM1, %k0 |
285 | VPCMP $0, %YMMZERO, %YMM0, %k1 |
286 | VPCMP $0, %YMMZERO, %YMM1, %k2 |
287 | /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
288 | kord %k1, %k2, %k1 |
289 | /* Each bit in K1 represents a NULL or a mismatch. */ |
290 | kord %k0, %k1, %k1 |
291 | ktestd %k1, %k1 |
292 | jne L(return_vec_size) |
293 | |
294 | VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 |
295 | VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 |
296 | VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 |
297 | VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 |
298 | |
299 | /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ |
300 | VPCMP $4, %YMM2, %YMM4, %k0 |
301 | VPCMP $0, %YMMZERO, %YMM2, %k1 |
302 | VPCMP $0, %YMMZERO, %YMM4, %k2 |
303 | /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ |
304 | kord %k1, %k2, %k1 |
305 | /* Each bit in K1 represents a NULL or a mismatch. */ |
306 | kord %k0, %k1, %k1 |
307 | ktestd %k1, %k1 |
308 | jne L(return_2_vec_size) |
309 | |
310 | /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ |
311 | VPCMP $4, %YMM3, %YMM5, %k0 |
312 | VPCMP $0, %YMMZERO, %YMM3, %k1 |
313 | VPCMP $0, %YMMZERO, %YMM5, %k2 |
314 | /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ |
315 | kord %k1, %k2, %k1 |
316 | /* Each bit in K1 represents a NULL or a mismatch. */ |
317 | kord %k0, %k1, %k1 |
318 | ktestd %k1, %k1 |
319 | jne L(return_3_vec_size) |
320 | L(main_loop_header): |
321 | leaq (VEC_SIZE * 4)(%rdi), %rdx |
322 | movl $PAGE_SIZE, %ecx |
323 | /* Align load via RAX. */ |
324 | andq $-(VEC_SIZE * 4), %rdx |
325 | subq %rdi, %rdx |
326 | leaq (%rdi, %rdx), %rax |
327 | # ifdef USE_AS_STRNCMP |
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when the base pointers
	   are moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
334 | subq %rdx, %r11 |
335 | jbe L(zero) |
336 | # endif |
337 | addq %rsi, %rdx |
338 | movq %rdx, %rsi |
339 | andl $(PAGE_SIZE - 1), %esi |
340 | /* Number of bytes before page crossing. */ |
341 | subq %rsi, %rcx |
342 | /* Number of VEC_SIZE * 4 blocks before page crossing. */ |
343 | shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx |
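	/* For illustration: if the second string's loop position sits
	   3968 bytes into its page, 4096 - 3968 == 128 bytes remain
	   before the crossing, i.e. 128 >> DIVIDE_BY_VEC_4_SHIFT == 1
	   whole block of VEC_SIZE * 4.  */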
344 | /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ |
345 | movl %ecx, %esi |
346 | jmp L(loop_start) |
347 | |
348 | .p2align 4 |
349 | L(loop): |
350 | # ifdef USE_AS_STRNCMP |
351 | /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease |
352 | the maximum offset (%r11) by the same amount. */ |
353 | subq $(VEC_SIZE * 4), %r11 |
354 | jbe L(zero) |
355 | # endif |
356 | addq $(VEC_SIZE * 4), %rax |
357 | addq $(VEC_SIZE * 4), %rdx |
358 | L(loop_start): |
359 | testl %esi, %esi |
360 | leal -1(%esi), %esi |
361 | je L(loop_cross_page) |
362 | L(back_to_loop): |
	/* Main loop, comparing 4 vectors at a time. */
364 | VMOVA (%rax), %YMM0 |
365 | VMOVA VEC_SIZE(%rax), %YMM2 |
366 | VMOVA (VEC_SIZE * 2)(%rax), %YMM4 |
367 | VMOVA (VEC_SIZE * 3)(%rax), %YMM6 |
368 | VMOVU (%rdx), %YMM1 |
369 | VMOVU VEC_SIZE(%rdx), %YMM3 |
370 | VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 |
371 | VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 |
372 | |
373 | VPCMP $4, %YMM0, %YMM1, %k0 |
374 | VPCMP $0, %YMMZERO, %YMM0, %k1 |
375 | VPCMP $0, %YMMZERO, %YMM1, %k2 |
376 | kord %k1, %k2, %k1 |
377 | /* Each bit in K4 represents a NULL or a mismatch in YMM0 and |
378 | YMM1. */ |
379 | kord %k0, %k1, %k4 |
380 | |
381 | VPCMP $4, %YMM2, %YMM3, %k0 |
382 | VPCMP $0, %YMMZERO, %YMM2, %k1 |
383 | VPCMP $0, %YMMZERO, %YMM3, %k2 |
384 | kord %k1, %k2, %k1 |
385 | /* Each bit in K5 represents a NULL or a mismatch in YMM2 and |
386 | YMM3. */ |
387 | kord %k0, %k1, %k5 |
388 | |
389 | VPCMP $4, %YMM4, %YMM5, %k0 |
390 | VPCMP $0, %YMMZERO, %YMM4, %k1 |
391 | VPCMP $0, %YMMZERO, %YMM5, %k2 |
392 | kord %k1, %k2, %k1 |
393 | /* Each bit in K6 represents a NULL or a mismatch in YMM4 and |
394 | YMM5. */ |
395 | kord %k0, %k1, %k6 |
396 | |
397 | VPCMP $4, %YMM6, %YMM7, %k0 |
398 | VPCMP $0, %YMMZERO, %YMM6, %k1 |
399 | VPCMP $0, %YMMZERO, %YMM7, %k2 |
400 | kord %k1, %k2, %k1 |
401 | /* Each bit in K7 represents a NULL or a mismatch in YMM6 and |
402 | YMM7. */ |
403 | kord %k0, %k1, %k7 |
404 | |
405 | kord %k4, %k5, %k0 |
406 | kord %k6, %k7, %k1 |
407 | |
	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, unlike SSE2 strcmp where
	   ORing is possible.  kortestd sets ZF only if (%k0 | %k1) is
	   all zeros, i.e. there is no mismatch and no null in any of
	   the 4 vectors. */
412 | kortestd %k0, %k1 |
413 | je L(loop) |
414 | ktestd %k4, %k4 |
415 | je L(test_vec) |
416 | kmovd %k4, %edi |
417 | tzcntl %edi, %ecx |
418 | # ifdef USE_AS_WCSCMP |
419 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
420 | sall $2, %ecx |
421 | # endif |
422 | # ifdef USE_AS_STRNCMP |
423 | cmpq %rcx, %r11 |
424 | jbe L(zero) |
425 | # ifdef USE_AS_WCSCMP |
426 | movq %rax, %rsi |
427 | xorl %eax, %eax |
428 | movl (%rsi, %rcx), %edi |
429 | cmpl (%rdx, %rcx), %edi |
430 | jne L(wcscmp_return) |
431 | # else |
432 | movzbl (%rax, %rcx), %eax |
433 | movzbl (%rdx, %rcx), %edx |
434 | subl %edx, %eax |
435 | # endif |
436 | # else |
437 | # ifdef USE_AS_WCSCMP |
438 | movq %rax, %rsi |
439 | xorl %eax, %eax |
440 | movl (%rsi, %rcx), %edi |
441 | cmpl (%rdx, %rcx), %edi |
442 | jne L(wcscmp_return) |
443 | # else |
444 | movzbl (%rax, %rcx), %eax |
445 | movzbl (%rdx, %rcx), %edx |
446 | subl %edx, %eax |
447 | # endif |
448 | # endif |
449 | ret |
450 | |
451 | .p2align 4 |
452 | L(test_vec): |
453 | # ifdef USE_AS_STRNCMP |
454 | /* The first vector matched. Return 0 if the maximum offset |
455 | (%r11) <= VEC_SIZE. */ |
456 | cmpq $VEC_SIZE, %r11 |
457 | jbe L(zero) |
458 | # endif |
459 | ktestd %k5, %k5 |
460 | je L(test_2_vec) |
461 | kmovd %k5, %ecx |
462 | tzcntl %ecx, %edi |
463 | # ifdef USE_AS_WCSCMP |
464 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
465 | sall $2, %edi |
466 | # endif |
467 | # ifdef USE_AS_STRNCMP |
468 | addq $VEC_SIZE, %rdi |
469 | cmpq %rdi, %r11 |
470 | jbe L(zero) |
471 | # ifdef USE_AS_WCSCMP |
472 | movq %rax, %rsi |
473 | xorl %eax, %eax |
474 | movl (%rsi, %rdi), %ecx |
475 | cmpl (%rdx, %rdi), %ecx |
476 | jne L(wcscmp_return) |
477 | # else |
478 | movzbl (%rax, %rdi), %eax |
479 | movzbl (%rdx, %rdi), %edx |
480 | subl %edx, %eax |
481 | # endif |
482 | # else |
483 | # ifdef USE_AS_WCSCMP |
484 | movq %rax, %rsi |
485 | xorl %eax, %eax |
486 | movl VEC_SIZE(%rsi, %rdi), %ecx |
487 | cmpl VEC_SIZE(%rdx, %rdi), %ecx |
488 | jne L(wcscmp_return) |
489 | # else |
490 | movzbl VEC_SIZE(%rax, %rdi), %eax |
491 | movzbl VEC_SIZE(%rdx, %rdi), %edx |
492 | subl %edx, %eax |
493 | # endif |
494 | # endif |
495 | ret |
496 | |
497 | .p2align 4 |
498 | L(test_2_vec): |
499 | # ifdef USE_AS_STRNCMP |
500 | /* The first 2 vectors matched. Return 0 if the maximum offset |
501 | (%r11) <= 2 * VEC_SIZE. */ |
502 | cmpq $(VEC_SIZE * 2), %r11 |
503 | jbe L(zero) |
504 | # endif |
505 | ktestd %k6, %k6 |
506 | je L(test_3_vec) |
507 | kmovd %k6, %ecx |
508 | tzcntl %ecx, %edi |
509 | # ifdef USE_AS_WCSCMP |
510 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
511 | sall $2, %edi |
512 | # endif |
513 | # ifdef USE_AS_STRNCMP |
514 | addq $(VEC_SIZE * 2), %rdi |
515 | cmpq %rdi, %r11 |
516 | jbe L(zero) |
517 | # ifdef USE_AS_WCSCMP |
518 | movq %rax, %rsi |
519 | xorl %eax, %eax |
520 | movl (%rsi, %rdi), %ecx |
521 | cmpl (%rdx, %rdi), %ecx |
522 | jne L(wcscmp_return) |
523 | # else |
524 | movzbl (%rax, %rdi), %eax |
525 | movzbl (%rdx, %rdi), %edx |
526 | subl %edx, %eax |
527 | # endif |
528 | # else |
529 | # ifdef USE_AS_WCSCMP |
530 | movq %rax, %rsi |
531 | xorl %eax, %eax |
532 | movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx |
533 | cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx |
534 | jne L(wcscmp_return) |
535 | # else |
536 | movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax |
537 | movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx |
538 | subl %edx, %eax |
539 | # endif |
540 | # endif |
541 | ret |
542 | |
543 | .p2align 4 |
544 | L(test_3_vec): |
545 | # ifdef USE_AS_STRNCMP |
546 | /* The first 3 vectors matched. Return 0 if the maximum offset |
547 | (%r11) <= 3 * VEC_SIZE. */ |
548 | cmpq $(VEC_SIZE * 3), %r11 |
549 | jbe L(zero) |
550 | # endif |
551 | kmovd %k7, %esi |
552 | tzcntl %esi, %ecx |
553 | # ifdef USE_AS_WCSCMP |
554 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
555 | sall $2, %ecx |
556 | # endif |
557 | # ifdef USE_AS_STRNCMP |
558 | addq $(VEC_SIZE * 3), %rcx |
559 | cmpq %rcx, %r11 |
560 | jbe L(zero) |
561 | # ifdef USE_AS_WCSCMP |
562 | movq %rax, %rsi |
563 | xorl %eax, %eax |
564 | movl (%rsi, %rcx), %esi |
565 | cmpl (%rdx, %rcx), %esi |
566 | jne L(wcscmp_return) |
567 | # else |
568 | movzbl (%rax, %rcx), %eax |
569 | movzbl (%rdx, %rcx), %edx |
570 | subl %edx, %eax |
571 | # endif |
572 | # else |
573 | # ifdef USE_AS_WCSCMP |
574 | movq %rax, %rsi |
575 | xorl %eax, %eax |
576 | movl (VEC_SIZE * 3)(%rsi, %rcx), %esi |
577 | cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi |
578 | jne L(wcscmp_return) |
579 | # else |
580 | movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax |
581 | movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx |
582 | subl %edx, %eax |
583 | # endif |
584 | # endif |
585 | ret |
586 | |
587 | .p2align 4 |
588 | L(loop_cross_page): |
589 | xorl %r10d, %r10d |
590 | movq %rdx, %rcx |
	/* Align load via RDX.  We load ECX extra bytes, which should
	   be ignored. */
593 | andl $((VEC_SIZE * 4) - 1), %ecx |
594 | /* R10 is -RCX. */ |
595 | subq %rcx, %r10 |
596 | |
597 | /* This works only if VEC_SIZE * 2 == 64. */ |
598 | # if (VEC_SIZE * 2) != 64 |
599 | # error (VEC_SIZE * 2) != 64 |
600 | # endif |
601 | |
602 | /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ |
603 | cmpl $(VEC_SIZE * 2), %ecx |
604 | jge L(loop_cross_page_2_vec) |
605 | |
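	/* For illustration only (not part of the build): the loads
	   below are pulled back by RCX bytes so that the block read
	   via %rdx is VEC_SIZE * 4 aligned and therefore cannot cross
	   the page.  The mask bits for those RCX extra bytes are then
	   discarded by shifting the combined 64-bit mask right, so a
	   surviving bit i corresponds to byte/char i from %rax/%rdx
	   themselves.  Rough C sketch assuming byte elements;
	   build_mask () is a hypothetical stand-in for the VPCMP/KORD
	   sequence over the two vector pairs:

	     off  = rdx & (VEC_SIZE * 4 - 1);            // == RCX
	     mask = build_mask (rax - off, rdx - off) >> off;
	     if (mask != 0)
	       idx = __builtin_ctzll (mask);             // tzcntq
	   */
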
606 | VMOVU (%rax, %r10), %YMM2 |
607 | VMOVU VEC_SIZE(%rax, %r10), %YMM3 |
608 | VMOVU (%rdx, %r10), %YMM4 |
609 | VMOVU VEC_SIZE(%rdx, %r10), %YMM5 |
610 | |
611 | VPCMP $4, %YMM4, %YMM2, %k0 |
612 | VPCMP $0, %YMMZERO, %YMM2, %k1 |
613 | VPCMP $0, %YMMZERO, %YMM4, %k2 |
614 | kord %k1, %k2, %k1 |
615 | /* Each bit in K1 represents a NULL or a mismatch in YMM2 and |
616 | YMM4. */ |
617 | kord %k0, %k1, %k1 |
618 | |
619 | VPCMP $4, %YMM5, %YMM3, %k3 |
620 | VPCMP $0, %YMMZERO, %YMM3, %k4 |
621 | VPCMP $0, %YMMZERO, %YMM5, %k5 |
622 | kord %k4, %k5, %k4 |
623 | /* Each bit in K3 represents a NULL or a mismatch in YMM3 and |
624 | YMM5. */ |
625 | kord %k3, %k4, %k3 |
626 | |
627 | # ifdef USE_AS_WCSCMP |
	/* NB: Each bit in K1/K3 represents a 4-byte element. */
629 | kshiftlw $8, %k3, %k2 |
	/* NB: Divide the shift count by 4 since each bit in K1
	   represents 4 bytes. */
632 | movl %ecx, %SHIFT_REG32 |
633 | sarl $2, %SHIFT_REG32 |
634 | # else |
635 | kshiftlq $32, %k3, %k2 |
636 | # endif |
637 | |
638 | /* Each bit in K1 represents a NULL or a mismatch. */ |
639 | korq %k1, %k2, %k1 |
640 | kmovq %k1, %rdi |
641 | |
642 | /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ |
643 | shrxq %SHIFT_REG64, %rdi, %rdi |
644 | testq %rdi, %rdi |
645 | je L(loop_cross_page_2_vec) |
646 | tzcntq %rdi, %rcx |
647 | # ifdef USE_AS_WCSCMP |
648 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
649 | sall $2, %ecx |
650 | # endif |
651 | # ifdef USE_AS_STRNCMP |
652 | cmpq %rcx, %r11 |
653 | jbe L(zero) |
654 | # ifdef USE_AS_WCSCMP |
655 | movq %rax, %rsi |
656 | xorl %eax, %eax |
657 | movl (%rsi, %rcx), %edi |
658 | cmpl (%rdx, %rcx), %edi |
659 | jne L(wcscmp_return) |
660 | # else |
661 | movzbl (%rax, %rcx), %eax |
662 | movzbl (%rdx, %rcx), %edx |
663 | subl %edx, %eax |
664 | # endif |
665 | # else |
666 | # ifdef USE_AS_WCSCMP |
667 | movq %rax, %rsi |
668 | xorl %eax, %eax |
669 | movl (%rsi, %rcx), %edi |
670 | cmpl (%rdx, %rcx), %edi |
671 | jne L(wcscmp_return) |
672 | # else |
673 | movzbl (%rax, %rcx), %eax |
674 | movzbl (%rdx, %rcx), %edx |
675 | subl %edx, %eax |
676 | # endif |
677 | # endif |
678 | ret |
679 | |
680 | .p2align 4 |
681 | L(loop_cross_page_2_vec): |
682 | /* The first VEC_SIZE * 2 bytes match or are ignored. */ |
683 | VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 |
684 | VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 |
685 | VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 |
686 | VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 |
687 | |
688 | VPCMP $4, %YMM0, %YMM2, %k0 |
689 | VPCMP $0, %YMMZERO, %YMM0, %k1 |
690 | VPCMP $0, %YMMZERO, %YMM2, %k2 |
691 | kord %k1, %k2, %k1 |
692 | /* Each bit in K1 represents a NULL or a mismatch in YMM0 and |
693 | YMM2. */ |
694 | kord %k0, %k1, %k1 |
695 | |
696 | VPCMP $4, %YMM1, %YMM3, %k3 |
697 | VPCMP $0, %YMMZERO, %YMM1, %k4 |
698 | VPCMP $0, %YMMZERO, %YMM3, %k5 |
699 | kord %k4, %k5, %k4 |
700 | /* Each bit in K3 represents a NULL or a mismatch in YMM1 and |
701 | YMM3. */ |
702 | kord %k3, %k4, %k3 |
703 | |
704 | # ifdef USE_AS_WCSCMP |
	/* NB: Each bit in K1/K3 represents a 4-byte element. */
706 | kshiftlw $8, %k3, %k2 |
707 | # else |
708 | kshiftlq $32, %k3, %k2 |
709 | # endif |
710 | |
711 | /* Each bit in K1 represents a NULL or a mismatch. */ |
712 | korq %k1, %k2, %k1 |
713 | kmovq %k1, %rdi |
714 | |
715 | xorl %r8d, %r8d |
716 | /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ |
717 | subl $(VEC_SIZE * 2), %ecx |
718 | jle 1f |
719 | /* R8 has number of bytes skipped. */ |
720 | movl %ecx, %r8d |
721 | # ifdef USE_AS_WCSCMP |
	/* NB: Divide the shift count by 4 since each bit in K1
	   represents 4 bytes. */
724 | sarl $2, %ecx |
725 | # endif |
726 | /* Skip ECX bytes. */ |
727 | shrq %cl, %rdi |
728 | 1: |
729 | /* Before jumping back to the loop, set ESI to the number of |
730 | VEC_SIZE * 4 blocks before page crossing. */ |
731 | movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi |
732 | |
733 | testq %rdi, %rdi |
734 | # ifdef USE_AS_STRNCMP |
	/* At this point, if %rdi is 0, VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  The label
	   jumped to checks whether the strncmp maximum offset has
	   been reached.  */
738 | je L(string_nbyte_offset_check) |
739 | # else |
740 | je L(back_to_loop) |
741 | # endif |
742 | tzcntq %rdi, %rcx |
743 | # ifdef USE_AS_WCSCMP |
744 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
745 | sall $2, %ecx |
746 | # endif |
747 | addq %r10, %rcx |
748 | /* Adjust for number of bytes skipped. */ |
749 | addq %r8, %rcx |
750 | # ifdef USE_AS_STRNCMP |
751 | addq $(VEC_SIZE * 2), %rcx |
752 | subq %rcx, %r11 |
753 | jbe L(zero) |
754 | # ifdef USE_AS_WCSCMP |
755 | movq %rax, %rsi |
756 | xorl %eax, %eax |
757 | movl (%rsi, %rcx), %edi |
758 | cmpl (%rdx, %rcx), %edi |
759 | jne L(wcscmp_return) |
760 | # else |
761 | movzbl (%rax, %rcx), %eax |
762 | movzbl (%rdx, %rcx), %edx |
763 | subl %edx, %eax |
764 | # endif |
765 | # else |
766 | # ifdef USE_AS_WCSCMP |
767 | movq %rax, %rsi |
768 | xorl %eax, %eax |
769 | movl (VEC_SIZE * 2)(%rsi, %rcx), %edi |
770 | cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi |
771 | jne L(wcscmp_return) |
772 | # else |
773 | movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax |
774 | movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx |
775 | subl %edx, %eax |
776 | # endif |
777 | # endif |
778 | ret |
779 | |
780 | # ifdef USE_AS_STRNCMP |
781 | L(string_nbyte_offset_check): |
782 | leaq (VEC_SIZE * 4)(%r10), %r10 |
783 | cmpq %r10, %r11 |
784 | jbe L(zero) |
785 | jmp L(back_to_loop) |
786 | # endif |
787 | |
788 | .p2align 4 |
789 | L(cross_page_loop): |
790 | /* Check one byte/dword at a time. */ |
791 | # ifdef USE_AS_WCSCMP |
792 | cmpl %ecx, %eax |
793 | # else |
794 | subl %ecx, %eax |
795 | # endif |
796 | jne L(different) |
797 | addl $SIZE_OF_CHAR, %edx |
798 | cmpl $(VEC_SIZE * 4), %edx |
799 | je L(main_loop_header) |
800 | # ifdef USE_AS_STRNCMP |
801 | cmpq %r11, %rdx |
802 | jae L(zero) |
803 | # endif |
804 | # ifdef USE_AS_WCSCMP |
805 | movl (%rdi, %rdx), %eax |
806 | movl (%rsi, %rdx), %ecx |
807 | # else |
808 | movzbl (%rdi, %rdx), %eax |
809 | movzbl (%rsi, %rdx), %ecx |
810 | # endif |
811 | /* Check null char. */ |
812 | testl %eax, %eax |
813 | jne L(cross_page_loop) |
814 | /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED |
815 | comparisons. */ |
816 | subl %ecx, %eax |
817 | # ifndef USE_AS_WCSCMP |
818 | L(different): |
819 | # endif |
820 | ret |
821 | |
822 | # ifdef USE_AS_WCSCMP |
823 | .p2align 4 |
824 | L(different): |
825 | /* Use movl to avoid modifying EFLAGS. */ |
826 | movl $0, %eax |
827 | setl %al |
828 | negl %eax |
829 | orl $1, %eax |
830 | ret |
831 | # endif |
832 | |
833 | # ifdef USE_AS_STRNCMP |
834 | .p2align 4 |
835 | L(zero): |
836 | xorl %eax, %eax |
837 | ret |
838 | |
839 | .p2align 4 |
840 | L(char0): |
841 | # ifdef USE_AS_WCSCMP |
842 | xorl %eax, %eax |
843 | movl (%rdi), %ecx |
844 | cmpl (%rsi), %ecx |
845 | jne L(wcscmp_return) |
846 | # else |
847 | movzbl (%rsi), %ecx |
848 | movzbl (%rdi), %eax |
849 | subl %ecx, %eax |
850 | # endif |
851 | ret |
852 | # endif |
853 | |
854 | .p2align 4 |
855 | L(last_vector): |
856 | addq %rdx, %rdi |
857 | addq %rdx, %rsi |
858 | # ifdef USE_AS_STRNCMP |
859 | subq %rdx, %r11 |
860 | # endif |
861 | tzcntl %ecx, %edx |
862 | # ifdef USE_AS_WCSCMP |
863 | /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
864 | sall $2, %edx |
865 | # endif |
866 | # ifdef USE_AS_STRNCMP |
867 | cmpq %r11, %rdx |
868 | jae L(zero) |
869 | # endif |
870 | # ifdef USE_AS_WCSCMP |
871 | xorl %eax, %eax |
872 | movl (%rdi, %rdx), %ecx |
873 | cmpl (%rsi, %rdx), %ecx |
874 | jne L(wcscmp_return) |
875 | # else |
876 | movzbl (%rdi, %rdx), %eax |
877 | movzbl (%rsi, %rdx), %edx |
878 | subl %edx, %eax |
879 | # endif |
880 | ret |
881 | |
	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting
	   with the wider ymm vector if possible and falling back to
	   xmm otherwise.  If fetching 16 bytes (xmm) would still cross
	   the boundary, the comparison must be done byte by byte.  */
887 | .p2align 4 |
888 | L(cross_page): |
889 | /* Try one ymm vector at a time. */ |
890 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
891 | jg L(cross_page_1_vector) |
892 | L(loop_1_vector): |
893 | VMOVU (%rdi, %rdx), %YMM0 |
894 | VMOVU (%rsi, %rdx), %YMM1 |
895 | |
896 | /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
897 | VPCMP $4, %YMM0, %YMM1, %k0 |
898 | VPCMP $0, %YMMZERO, %YMM0, %k1 |
899 | VPCMP $0, %YMMZERO, %YMM1, %k2 |
900 | /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
901 | kord %k1, %k2, %k1 |
902 | /* Each bit in K1 represents a NULL or a mismatch. */ |
903 | kord %k0, %k1, %k1 |
904 | kmovd %k1, %ecx |
905 | testl %ecx, %ecx |
906 | jne L(last_vector) |
907 | |
908 | addl $VEC_SIZE, %edx |
909 | |
910 | addl $VEC_SIZE, %eax |
911 | # ifdef USE_AS_STRNCMP |
912 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
913 | (%r11). */ |
914 | cmpq %r11, %rdx |
915 | jae L(zero) |
916 | # endif |
917 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
918 | jle L(loop_1_vector) |
919 | L(cross_page_1_vector): |
920 | /* Less than 32 bytes to check, try one xmm vector. */ |
921 | cmpl $(PAGE_SIZE - 16), %eax |
922 | jg L(cross_page_1_xmm) |
923 | VMOVU (%rdi, %rdx), %XMM0 |
924 | VMOVU (%rsi, %rdx), %XMM1 |
925 | |
926 | /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
927 | VPCMP $4, %XMM0, %XMM1, %k0 |
928 | VPCMP $0, %XMMZERO, %XMM0, %k1 |
929 | VPCMP $0, %XMMZERO, %XMM1, %k2 |
930 | /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
931 | korw %k1, %k2, %k1 |
932 | /* Each bit in K1 represents a NULL or a mismatch. */ |
933 | korw %k0, %k1, %k1 |
934 | kmovw %k1, %ecx |
935 | testl %ecx, %ecx |
936 | jne L(last_vector) |
937 | |
938 | addl $16, %edx |
939 | # ifndef USE_AS_WCSCMP |
940 | addl $16, %eax |
941 | # endif |
942 | # ifdef USE_AS_STRNCMP |
943 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
944 | (%r11). */ |
945 | cmpq %r11, %rdx |
946 | jae L(zero) |
947 | # endif |
948 | |
949 | L(cross_page_1_xmm): |
950 | # ifndef USE_AS_WCSCMP |
951 | /* Less than 16 bytes to check, try 8 byte vector. NB: No need |
952 | for wcscmp nor wcsncmp since wide char is 4 bytes. */ |
953 | cmpl $(PAGE_SIZE - 8), %eax |
954 | jg L(cross_page_8bytes) |
955 | vmovq (%rdi, %rdx), %XMM0 |
956 | vmovq (%rsi, %rdx), %XMM1 |
957 | |
958 | /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
959 | VPCMP $4, %XMM0, %XMM1, %k0 |
960 | VPCMP $0, %XMMZERO, %XMM0, %k1 |
961 | VPCMP $0, %XMMZERO, %XMM1, %k2 |
962 | /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
963 | kord %k1, %k2, %k1 |
964 | /* Each bit in K1 represents a NULL or a mismatch. */ |
965 | kord %k0, %k1, %k1 |
966 | kmovd %k1, %ecx |
967 | |
968 | # ifdef USE_AS_WCSCMP |
969 | /* Only last 2 bits are valid. */ |
970 | andl $0x3, %ecx |
971 | # else |
972 | /* Only last 8 bits are valid. */ |
973 | andl $0xff, %ecx |
974 | # endif |
975 | |
976 | testl %ecx, %ecx |
977 | jne L(last_vector) |
978 | |
979 | addl $8, %edx |
980 | addl $8, %eax |
981 | # ifdef USE_AS_STRNCMP |
982 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
983 | (%r11). */ |
984 | cmpq %r11, %rdx |
985 | jae L(zero) |
986 | # endif |
987 | |
988 | L(cross_page_8bytes): |
989 | /* Less than 8 bytes to check, try 4 byte vector. */ |
990 | cmpl $(PAGE_SIZE - 4), %eax |
991 | jg L(cross_page_4bytes) |
992 | vmovd (%rdi, %rdx), %XMM0 |
993 | vmovd (%rsi, %rdx), %XMM1 |
994 | |
995 | /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
996 | VPCMP $4, %XMM0, %XMM1, %k0 |
997 | VPCMP $0, %XMMZERO, %XMM0, %k1 |
998 | VPCMP $0, %XMMZERO, %XMM1, %k2 |
999 | /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
1000 | kord %k1, %k2, %k1 |
1001 | /* Each bit in K1 represents a NULL or a mismatch. */ |
1002 | kord %k0, %k1, %k1 |
1003 | kmovd %k1, %ecx |
1004 | |
1005 | # ifdef USE_AS_WCSCMP |
1006 | /* Only the last bit is valid. */ |
1007 | andl $0x1, %ecx |
1008 | # else |
1009 | /* Only last 4 bits are valid. */ |
1010 | andl $0xf, %ecx |
1011 | # endif |
1012 | |
1013 | testl %ecx, %ecx |
1014 | jne L(last_vector) |
1015 | |
1016 | addl $4, %edx |
1017 | # ifdef USE_AS_STRNCMP |
1018 | /* Return 0 if the current offset (%rdx) >= the maximum offset |
1019 | (%r11). */ |
1020 | cmpq %r11, %rdx |
1021 | jae L(zero) |
1022 | # endif |
1023 | |
1024 | L(cross_page_4bytes): |
1025 | # endif |
1026 | /* Less than 4 bytes to check, try one byte/dword at a time. */ |
1027 | # ifdef USE_AS_STRNCMP |
1028 | cmpq %r11, %rdx |
1029 | jae L(zero) |
1030 | # endif |
1031 | # ifdef USE_AS_WCSCMP |
1032 | movl (%rdi, %rdx), %eax |
1033 | movl (%rsi, %rdx), %ecx |
1034 | # else |
1035 | movzbl (%rdi, %rdx), %eax |
1036 | movzbl (%rsi, %rdx), %ecx |
1037 | # endif |
1038 | testl %eax, %eax |
1039 | jne L(cross_page_loop) |
1040 | subl %ecx, %eax |
1041 | ret |
1042 | END (STRCMP) |
1043 | #endif |
1044 | |