1/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
# define STRCMP __strcmp_evex
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
# define SHIFT_REG32 r8d
# define SHIFT_REG64 r8
/* 1 dword char == 4 bytes.  */
# define SIZE_OF_CHAR 4
# else
/* Compare packed bytes.  */
# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
# define SHIFT_REG32 ecx
# define SHIFT_REG64 rcx
/* 1 byte char == 1 byte.  */
# define SIZE_OF_CHAR 1
# endif

/* EVEX requires xmm16+/ymm16+ so that the legacy SSE/AVX registers
   xmm0-15 are left untouched (no vzeroupper needed).  */
# define XMMZERO xmm16
# define XMM0 xmm17
# define XMM1 xmm18

# define YMMZERO ymm16
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
# define YMM3 ymm20
# define YMM4 ymm21
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24
# define YMM8 ymm25
# define YMM9 ymm26
# define YMM10 ymm27

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, algorithm keeps the
   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
   KORD).  In general, the costs of comparing VEC_SIZE bytes (32-bytes)
   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
   instructions.  Main loop (away from page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each loop.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
   is the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */
	.section .text.evex,"ax",@progbits

/* int STRCMP (const CHAR *s1, const CHAR *s2 [, size_t n]).
   SysV AMD64: %rdi = s1, %rsi = s2, %rdx = n (USE_AS_STRNCMP only).
   Returns the comparison result in %eax (<0 / 0 / >0).  */
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
# ifdef USE_AS_WCSCMP
# ifndef __ILP32__
	movq	%rdx, %rcx
	/* Check if length could overflow when multiplied by
	   sizeof(wchar_t).  Checking top 8 bits will cover all potential
	   overflow cases as well as redirect cases where it is
	   impossible for the length to bound a valid memory region.  In
	   these cases just use 'wcscmp'.  */
	shrq	$56, %rcx
	jnz	__wcscmp_evex
# endif
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
# endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	VMOVU	(%rdi), %YMM0

	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2

	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}

	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).   */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
# ifdef USE_AS_WCSCMP
	/* NB: %rdx already includes the VEC_SIZE offset here, so no
	   displacement is needed on the loads below.  */
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(next_3_vectors):
	VMOVU	VEC_SIZE(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_vec_size)

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_2_vec_size)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	VMOVA	(%rax), %YMM0
	VMOVA	VEC_SIZE(%rax), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM8

	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
	VPTESTM	%YMM8, %YMM8, %k1

	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
	vpxorq	(%rdx), %YMM0, %YMM1
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7

	vporq	%YMM1, %YMM3, %YMM9
	vporq	%YMM5, %YMM7, %YMM10

	/* A non-zero CHAR in YMM9 represents a mismatch.  */
	vporq	%YMM9, %YMM10, %YMM9

	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(loop)

	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM0 and (%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_vec)
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM2 and VEC_SIZE(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
	VPTESTM	%YMM4, %YMM4, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
	VPTESTM	%YMM6, %YMM6, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
# else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
# error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	VMOVU	(%rax, %r10), %YMM2
	VMOVU	VEC_SIZE(%rax, %r10), %YMM3

	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM2 and 32 bytes at (%rdx, %r10).  */
	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since it is the lower 16/32 bits of RDI
	   below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
	VPTESTM	%YMM3, %YMM3, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here as lower bits are for CHAR
	   comparisons potentially out of range thus can be 0 without
	   indicating mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
	sall	$8, %edi
	/* NB: Divide shift count by 4 since each bit in K1 represent 4
	   bytes.  */
	movl	%ecx, %SHIFT_REG32
	sarl	$2, %SHIFT_REG32

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrxq	%SHIFT_REG64, %rdi, %rdi
	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since it is the lower 16/32 bits of RDI
	   below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	VPTESTM	%YMM1, %YMM1, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here as lower bits are for CHAR
	   comparisons potentially out of range thus can be 0 without
	   indicating mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
	sall	$8, %edi

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
# ifdef USE_AS_WCSCMP
	/* NB: Divide shift count by 4 since each bit in RDI represent 4
	   bytes.  */
	sarl	$2, %ecx
	/* Skip ECX bytes.  */
	shrl	%cl, %edi
# else
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
# endif
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi value is 0, it already tested
	   VEC_SIZE*4+%r10 bytes starting from %rax.  This label
	   checks whether strncmp maximum offset reached or not.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
# endif
	ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null CHAR.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	ret

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	ret
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(char0):
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
# endif
	ret
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

	/* Comparing on page boundary region requires special treatment:
	   It must be done one vector at a time, starting with the wider
	   ymm vector if possible, if not, with xmm.  If fetching 16 bytes
	   (xmm) still passes the boundary, byte comparison must be done.
	 */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	VMOVU	(%rdi, %rdx), %YMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	VMOVU	(%rdi, %rdx), %XMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	subl	$0xffff, %ecx
# endif
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %XMM0
	vmovq	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovb	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
# else
	subl	$0xff, %ecx
# endif
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %XMM0
	vmovd	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x1, %ecx
# else
	subl	$0xf, %ecx
# endif
	jne	L(last_vector)

	addl	$4, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	ret
END (STRCMP)
1080#endif
1081