strcmp-evex.S source code [glibc/sysdeps/x86_64/multiarch/strcmp-evex.S]

1	/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2	Copyright (C) 2021-2023 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>.  */
18
19	#include <isa-level.h>
20
21	#if ISA_SHOULD_BUILD (4)
22
23	# ifndef VEC_SIZE
24	# include "x86-evex256-vecs.h"
25	# endif
26
27	# define STRCMP_ISA _evex
28	# include "strcmp-naming.h"
29
30	# include <sysdep.h>
31	# if defined USE_AS_STRCASECMP_L
32	# include "locale-defines.h"
33	# endif
34
35	# ifndef STRCMP
36	# define STRCMP __strcmp_evex
37	# endif
38
39	# define PAGE_SIZE 4096
40
41	/ VEC_SIZE = Number of bytes in a ymm register. /
42	# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
43
44	# ifdef USE_AS_WCSCMP
45	/ Compare packed dwords. /
46	# define VPCMP vpcmpd
47	# define VPCMPEQ vpcmpeqd
48	# define VPMINU vpminud
49	# define VPTESTM vptestmd
50	# define VPTESTNM vptestnmd
51	/ 1 dword char == 4 bytes. /
52	# define SIZE_OF_CHAR 4
53
54	# define TESTEQ sub $((1 << CHAR_PER_VEC) - 1),
55
56	# define USE_WIDE_CHAR
57	# else
58	/ Compare packed bytes. /
59	# define VPCMP vpcmpb
60	# define VPCMPEQ vpcmpeqb
61	# define VPMINU vpminub
62	# define VPTESTM vptestmb
63	# define VPTESTNM vptestnmb
64	/ 1 byte char == 1 byte. /
65	# define SIZE_OF_CHAR 1
66
67	# define TESTEQ inc
68	# endif
69
70	# include "reg-macros.h"
71
72	# if VEC_SIZE == 64
73	# define RODATA_SECTION rodata.cst64
74	# else
75	# define RODATA_SECTION rodata.cst32
76	# endif
77
78	# if CHAR_PER_VEC == 64
79	# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 3)
80	# else
81	# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 2)
82	# endif
83
84	# ifdef USE_AS_STRNCMP
85	# define LOOP_REG VR9
86	# define LOOP_REG64 r9
87
88	# define OFFSET_REG8 r9b
89	# define OFFSET_REG r9d
90	# define OFFSET_REG64 r9
91	# else
92	# define LOOP_REG VRDX
93	# define LOOP_REG64 rdx
94
95	# define OFFSET_REG8 dl
96	# define OFFSET_REG edx
97	# define OFFSET_REG64 rdx
98	# endif
99
100	# if defined USE_AS_STRNCMP \|\| defined USE_AS_WCSCMP
101	# define VEC_OFFSET 0
102	# else
103	# define VEC_OFFSET (-VEC_SIZE)
104	# endif
105
106	# ifdef USE_AS_STRCASECMP_L
107	# define BYTE_LOOP_REG OFFSET_REG
108	# else
109	# define BYTE_LOOP_REG ecx
110	# endif
111
112	# ifdef USE_AS_STRCASECMP_L
113	# ifdef USE_AS_STRNCMP
114	# define LOCALE_REG rcx
115	# define LOCALE_REG_LP RCX_LP
116	# else
117	# define LOCALE_REG rdx
118	# define LOCALE_REG_LP RDX_LP
119	# endif
120	# endif
121
122	# define LCASE_MIN_V VMM(12)
123	# define LCASE_MAX_V VMM(13)
124	# define CASE_ADD_V VMM(14)
125
126	# if VEC_SIZE == 64
127	# define LCASE_MIN_YMM VMM_256(12)
128	# define LCASE_MAX_YMM VMM_256(13)
129	# define CASE_ADD_YMM VMM_256(14)
130	# endif
131
132	# define LCASE_MIN_XMM VMM_128(12)
133	# define LCASE_MAX_XMM VMM_128(13)
134	# define CASE_ADD_XMM VMM_128(14)
135
136	/ NB: wcsncmp uses r11 but strcasecmp is never used in*
137	conjunction with wcscmp. /*
138	# define TOLOWER_BASE %r11
139
140	# ifdef USE_AS_STRCASECMP_L
141	# define _REG(x, y) x ## y
142	# define REG(x, y) _REG(x, y)
143	# define TOLOWER(reg1, reg2, ext, vec_macro) \
144	vpsubb %REG(LCASE_MIN_, ext), reg1, %vec_macro(10); \
145	vpsubb %REG(LCASE_MIN_, ext), reg2, %vec_macro(11); \
146	vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
147	vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
148	vpaddb reg1, %REG(CASE_ADD_, ext), reg1{%k5}; \
149	vpaddb reg2, %REG(CASE_ADD_, ext), reg2{%k6}
150
151	# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
152	# define TOLOWER_VMM(...) TOLOWER(__VA_ARGS__, V, VMM)
153	# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM, VMM_256)
154	# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM, VMM_128)
155
156	# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro) \
157	TOLOWER (s1_reg, s2_reg, ext, vec_macro); \
158	VPCMPEQ s1_reg, s2_reg, reg_out
159
160	# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro) \
161	VMOVU s2_mem, s2_reg; \
162	CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
163
164	# define CMP_R1_R2_VMM(...) CMP_R1_R2(__VA_ARGS__, V, VMM)
165	# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
166	# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
167
168	# define CMP_R1_S2_VMM(...) CMP_R1_S2(__VA_ARGS__, V, VMM)
169	# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
170	# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
171
172	# else
173	# define TOLOWER_gpr(...)
174	# define TOLOWER_VMM(...)
175	# define TOLOWER_YMM(...)
176	# define TOLOWER_XMM(...)
177
178	# define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out) \
179	VPCMPEQ s2_reg, s1_reg, reg_out
180
181	# define CMP_R1_R2_YMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
182	# define CMP_R1_R2_XMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
183
184	# define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out) \
185	VPCMPEQ s2_mem, s1_reg, reg_out
186	# define CMP_R1_S2_YMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
187	# define CMP_R1_S2_XMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
188	# endif
189
190	/ Warning!*
191	wcscmp/wcsncmp have to use SIGNED comparison for elements.
192	strcmp/strncmp have to use UNSIGNED comparison for elements.
193	*/
194
195	/* The main idea of the string comparison (byte or dword) using 256-bit
196	EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
197	latter can be on either packed bytes or dwords depending on
198	USE_AS_WCSCMP. In order to check the null CHAR, the algorithm keeps the
199	matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
200	KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
201	are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
202	instructions. Main loop (away from page boundary) compares 4
203	vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
204	bytes) on each loop.
205
206	The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
207	is the same as strcmp, except that a maximum offset is tracked. If
208	the maximum offset is reached before a difference is found, zero is
209	returned.  */
210
211	.section SECTION(.text), "ax", @progbits
212	.align `16`
213	.type STRCMP, @function
214	.globl STRCMP
215	# ifdef USE_AS_STRCASECMP_L
216	ENTRY (STRCASECMP)
217	movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
218	mov %fs:(%rax), %LOCALE_REG_LP
219
220	/ Either 1 or 5 bytes (depending if CET is enabled). /
221	.p2align `4`
222	END (STRCASECMP)
223	/ FALLTHROUGH to strcasecmp/strncasecmp_l. /
224	# endif
225
226	.p2align `4`
227	STRCMP:
228	cfi_startproc
229	_CET_ENDBR
230	CALL_MCOUNT
231
232	# if defined USE_AS_STRCASECMP_L
233	/ We have to fall back on the C implementation for locales with*
234	encodings not matching ASCII for single bytes. /*
235	# if LOCALE_T___LOCALES != 0 \|\| LC_CTYPE != 0
236	mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
237	# else
238	mov (%LOCALE_REG), %RAX_LP
239	# endif
240	testb $`1`, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
241	jne STRCASECMP_L_NONASCII
242	leaq _nl_C_LC_CTYPE_tolower + `128` * `4`(%rip), TOLOWER_BASE
243	# endif
244
245	# ifdef USE_AS_STRNCMP
246	/* Don't overwrite LOCALE_REG (rcx) until we have passed
247	L(one_or_less). Otherwise we might use the wrong locale in
248	the OVERFLOW_STRCMP (strcasecmp_l).  */
249	# ifdef __ILP32__
250	/ Clear the upper 32 bits. /
251	movl %edx, %edx
252	# endif
253	cmp $`1`, %RDX_LP
254	/ Signed comparison intentional. We use this branch to also*
255	test cases where length >= 2^63. These very large sizes can be
256	handled with strcmp as there is no way for that length to
257	actually bound the buffer. /*
258	jle L(one_or_less)
259	# endif
260
261	# if defined USE_AS_STRCASECMP_L
262	.section RODATA_SECTION, "aM", @progbits, VEC_SIZE
263	.align VEC_SIZE
264	L(lcase_min):
265	.quad `0x4141414141414141`
266	.quad `0x4141414141414141`
267	.quad `0x4141414141414141`
268	.quad `0x4141414141414141`
269	# if VEC_SIZE == 64
270	.quad `0x4141414141414141`
271	.quad `0x4141414141414141`
272	.quad `0x4141414141414141`
273	.quad `0x4141414141414141`
274	# endif
275	L(lcase_max):
276	.quad `0x1a1a1a1a1a1a1a1a`
277	.quad `0x1a1a1a1a1a1a1a1a`
278	.quad `0x1a1a1a1a1a1a1a1a`
279	.quad `0x1a1a1a1a1a1a1a1a`
280	# if VEC_SIZE == 64
281	.quad `0x1a1a1a1a1a1a1a1a`
282	.quad `0x1a1a1a1a1a1a1a1a`
283	.quad `0x1a1a1a1a1a1a1a1a`
284	.quad `0x1a1a1a1a1a1a1a1a`
285	# endif
286	L(case_add):
287	.quad `0x2020202020202020`
288	.quad `0x2020202020202020`
289	.quad `0x2020202020202020`
290	.quad `0x2020202020202020`
291	# if VEC_SIZE == 64
292	.quad `0x2020202020202020`
293	.quad `0x2020202020202020`
294	.quad `0x2020202020202020`
295	.quad `0x2020202020202020`
296	# endif
297	.previous
298
299	VMOVA L(lcase_min)(%rip), %LCASE_MIN_V
300	VMOVA L(lcase_max)(%rip), %LCASE_MAX_V
301	VMOVA L(case_add)(%rip), %CASE_ADD_V
302	# endif
303
304	movl %edi, %eax
305	orl %esi, %eax
306	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
307	sall $`20`, %eax
308	/ Check if s1 or s2 may cross a page in next 4x VEC loads. /
309	cmpl $((PAGE_SIZE -(VEC_SIZE * `4`)) << `20`), %eax
310	ja L(page_cross)
311
312	L(no_page_cross):
313	/ Safe to compare 4x vectors. /
314	VMOVU (%rdi), %VMM(`0`)
315	VPTESTM %VMM(`0`), %VMM(`0`), %k2
316	/ Each bit cleared in K1 represents a mismatch or a null CHAR*
317	in YMM0 and 32 bytes at (%rsi). /*
318	CMP_R1_S2_VMM (%VMM(`0`), (%rsi), %VMM(`1`), %k1){%k2}
319	KMOV %k1, %VRCX
320	# ifdef USE_AS_STRNCMP
321	cmpq $CHAR_PER_VEC, %rdx
322	jbe L(vec_0_test_len)
323	# endif
324
325	/ TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for*
326	wcscmp/wcsncmp. /*
327
328	/ All 1s represents all equals. TESTEQ will overflow to zero in*
329	all equals case. Otherwise 1s will carry until position of
330	first mismatch. /*
331	TESTEQ %VRCX
332	jz L(more_3x_vec)
333
334	.p2align `4`,, `4`
335	L(return_vec_0):
336	bsf %VRCX, %VRCX
337	# ifdef USE_AS_WCSCMP
338	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
339	xorl %eax, %eax
340	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
341	je L(ret0)
342	setl %al
343	negl %eax
344	orl $`1`, %eax
345	# else
346	movzbl (%rdi, %rcx), %eax
347	/ For VEC_SIZE == 64 use movb instead of movzbl to save a byte*
348	and keep logic for len <= VEC_SIZE (common) in just the
349	first cache line. NB: No evex512 processor has partial-
350	register stalls. If that changes this ifdef can be disabled
351	without affecting correctness. /*
352	# if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
353	movb (%rsi, %rcx), %cl
354	# else
355	movzbl (%rsi, %rcx), %ecx
356	# endif
357	TOLOWER_gpr (%rax, %eax)
358	TOLOWER_gpr (%rcx, %ecx)
359	subl %ecx, %eax
360	# endif
361	L(ret0):
362	ret
363
364	# ifdef USE_AS_STRNCMP
365	.p2align `4`,, `4`
366	L(vec_0_test_len):
367	not %VRCX
368	bzhi %VRDX, %VRCX, %VRAX
369	jnz L(return_vec_0)
370	/ Align if will cross fetch block. /
371	.p2align `4`,, `2`
372	L(ret_zero):
373	xorl %eax, %eax
374	ret
375
376	.p2align `4`,, `5`
377	L(one_or_less):
378	# ifdef USE_AS_STRCASECMP_L
379	/ Set locale argument for strcasecmp. /
380	movq %LOCALE_REG, %rdx
381	# endif
382	jb L(ret_zero)
383	/ 'nbe' covers the case where length is negative (large*
384	unsigned). /*
385	jnbe OVERFLOW_STRCMP
386	# ifdef USE_AS_WCSCMP
387	movl (%rdi), %edx
388	xorl %eax, %eax
389	cmpl (%rsi), %edx
390	je L(ret1)
391	setl %al
392	negl %eax
393	orl $`1`, %eax
394	# else
395	movzbl (%rdi), %eax
396	movzbl (%rsi), %ecx
397	TOLOWER_gpr (%rax, %eax)
398	TOLOWER_gpr (%rcx, %ecx)
399	subl %ecx, %eax
400	# endif
401	L(ret1):
402	ret
403	# endif
404
405	.p2align `4`,, `10`
406	L(return_vec_1):
407	bsf %VRCX, %VRCX
408	# ifdef USE_AS_STRNCMP
409	/ rdx must be > CHAR_PER_VEC so its safe to subtract without*
410	worrying about underflow. /*
411	addq $-CHAR_PER_VEC, %rdx
412	cmpq %rcx, %rdx
413	jbe L(ret_zero)
414	# endif
415	# ifdef USE_AS_WCSCMP
416	movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
417	xorl %eax, %eax
418	cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
419	je L(ret2)
420	setl %al
421	negl %eax
422	orl $`1`, %eax
423	# else
424	movzbl VEC_SIZE(%rdi, %rcx), %eax
425	movzbl VEC_SIZE(%rsi, %rcx), %ecx
426	TOLOWER_gpr (%rax, %eax)
427	TOLOWER_gpr (%rcx, %ecx)
428	subl %ecx, %eax
429	# endif
430	L(ret2):
431	ret
432
433	.p2align `4`,, `10`
434	# ifdef USE_AS_STRNCMP
435	L(return_vec_3):
436	# if CHAR_PER_VEC <= 32
437	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
438	additional branches by adjusting the bit positions from
439	VEC3. We can't do this for CHAR_PER_VEC == 64.  */
440	# if CHAR_PER_VEC <= 16
441	sall $CHAR_PER_VEC, %ecx
442	# else
443	salq $CHAR_PER_VEC, %rcx
444	# endif
445	# else
446	/ If CHAR_PER_VEC == 64 we can't shift the return GPR so just*
447	check it. /*
448	bsf %VRCX, %VRCX
449	addl $(CHAR_PER_VEC), %ecx
450	cmpq %rcx, %rdx
451	ja L(ret_vec_3_finish)
452	xorl %eax, %eax
453	ret
454	# endif
455	# endif
456
457	/ If CHAR_PER_VEC == 64 we can't combine matches from the last*
458	2x VEC so need separate return label. /*
459	L(return_vec_2):
460	# if (CHAR_PER_VEC <= 16) \|\| !(defined USE_AS_STRNCMP)
461	bsf %VRCX, %VRCX
462	# else
463	bsfq %rcx, %rcx
464	# endif
465	# ifdef USE_AS_STRNCMP
466	cmpq %rcx, %rdx
467	jbe L(ret_zero)
468	# endif
469
470	L(ret_vec_3_finish):
471	# ifdef USE_AS_WCSCMP
472	movl (VEC_SIZE * `2`)(%rdi, %rcx, SIZE_OF_CHAR), %edx
473	xorl %eax, %eax
474	cmpl (VEC_SIZE * `2`)(%rsi, %rcx, SIZE_OF_CHAR), %edx
475	je L(ret3)
476	setl %al
477	negl %eax
478	orl $`1`, %eax
479	# else
480	movzbl (VEC_SIZE * `2`)(%rdi, %rcx), %eax
481	movzbl (VEC_SIZE * `2`)(%rsi, %rcx), %ecx
482	TOLOWER_gpr (%rax, %eax)
483	TOLOWER_gpr (%rcx, %ecx)
484	subl %ecx, %eax
485	# endif
486	L(ret3):
487	ret
488
489	# ifndef USE_AS_STRNCMP
490	.p2align `4`,, `10`
491	L(return_vec_3):
492	bsf %VRCX, %VRCX
493	# ifdef USE_AS_WCSCMP
494	movl (VEC_SIZE * `3`)(%rdi, %rcx, SIZE_OF_CHAR), %edx
495	xorl %eax, %eax
496	cmpl (VEC_SIZE * `3`)(%rsi, %rcx, SIZE_OF_CHAR), %edx
497	je L(ret4)
498	setl %al
499	negl %eax
500	orl $`1`, %eax
501	# else
502	movzbl (VEC_SIZE * `3`)(%rdi, %rcx), %eax
503	movzbl (VEC_SIZE * `3`)(%rsi, %rcx), %ecx
504	TOLOWER_gpr (%rax, %eax)
505	TOLOWER_gpr (%rcx, %ecx)
506	subl %ecx, %eax
507	# endif
508	L(ret4):
509	ret
510	# endif
511
512	/ 32 byte align here ensures the main loop is ideally aligned*
513	for DSB. /*
514	.p2align `5`
515	L(more_3x_vec):
516	/ Safe to compare 4x vectors. /
517	VMOVU (VEC_SIZE)(%rdi), %VMM(`0`)
518	VPTESTM %VMM(`0`), %VMM(`0`), %k2
519	CMP_R1_S2_VMM (%VMM(`0`), VEC_SIZE(%rsi), %VMM(`1`), %k1){%k2}
520	KMOV %k1, %VRCX
521	TESTEQ %VRCX
522	jnz L(return_vec_1)
523
524	# ifdef USE_AS_STRNCMP
525	subq $(CHAR_PER_VEC * `2`), %rdx
526	jbe L(ret_zero)
527	# endif
528
529	VMOVU (VEC_SIZE * `2`)(%rdi), %VMM(`0`)
530	VPTESTM %VMM(`0`), %VMM(`0`), %k2
531	CMP_R1_S2_VMM (%VMM(`0`), (VEC_SIZE * `2`)(%rsi), %VMM(`1`), %k1){%k2}
532	KMOV %k1, %VRCX
533	TESTEQ %VRCX
534	jnz L(return_vec_2)
535
536	VMOVU (VEC_SIZE * `3`)(%rdi), %VMM(`0`)
537	VPTESTM %VMM(`0`), %VMM(`0`), %k2
538	CMP_R1_S2_VMM (%VMM(`0`), (VEC_SIZE * `3`)(%rsi), %VMM(`1`), %k1){%k2}
539	KMOV %k1, %VRCX
540	TESTEQ %VRCX
541	jnz L(return_vec_3)
542
543	# ifdef USE_AS_STRNCMP
544	cmpq $(CHAR_PER_VEC * `2`), %rdx
545	jbe L(ret_zero)
546	# endif
547
548
549	# ifdef USE_AS_WCSCMP
550	/* Any non-zero positive value that doesn't interfere with 0x1.
551	 */
552	movl $`2`, %r8d
553
554	# else
555	xorl %r8d, %r8d
556	# endif
557
558	/ The prepare labels are various entry points from the page*
559	cross logic. /*
560	L(prepare_loop):
561
562	# ifdef USE_AS_STRNCMP
563	# ifdef USE_AS_WCSCMP
564	L(prepare_loop_no_len):
565	movl %edi, %ecx
566	andl $(VEC_SIZE * `4` - `1`), %ecx
567	shrl $`2`, %ecx
568	leaq (CHAR_PER_VEC * `2`)(%rdx, %rcx), %rdx
569	# else
570	/ Store N + (VEC_SIZE * 4) and place check at the beginning of*
571	the loop. /*
572	leaq (VEC_SIZE * `2`)(%rdi, %rdx), %rdx
573	L(prepare_loop_no_len):
574	# endif
575	# else
576	L(prepare_loop_no_len):
577	# endif
578
579	/ Align s1 and adjust s2 accordingly. /
580	subq %rdi, %rsi
581	andq $-(VEC_SIZE * `4`), %rdi
582	L(prepare_loop_readj):
583	addq %rdi, %rsi
584	# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
585	subq %rdi, %rdx
586	# endif
587
588	L(prepare_loop_aligned):
589	/ eax stores distance from rsi to next page cross. These cases*
590	need to be handled specially as the 4x loop could potentially
591	read memory past the length of s1 or s2 and across a page
592	boundary. /*
593	movl $-(VEC_SIZE * `4`), %eax
594	subl %esi, %eax
595	andl $(PAGE_SIZE - `1`), %eax
596
597
598	/ Loop 4x comparisons at a time. /
599	.p2align `4`
600	L(loop):
601
602	/ End condition for strncmp. /
603	# ifdef USE_AS_STRNCMP
604	subq $(CHAR_PER_VEC * `4`), %rdx
605	jbe L(ret_zero)
606	# endif
607
608	subq $-(VEC_SIZE * `4`), %rdi
609	subq $-(VEC_SIZE * `4`), %rsi
610
611	/ Check if rsi loads will cross a page boundary. /
612	addl $-(VEC_SIZE * `4`), %eax
613	jnb L(page_cross_during_loop)
614
615	/ Loop entry after handling page cross during loop. /
616	L(loop_skip_page_cross_check):
617	VMOVA (VEC_SIZE * `0`)(%rdi), %VMM(`0`)
618	VMOVA (VEC_SIZE * `1`)(%rdi), %VMM(`2`)
619	VMOVA (VEC_SIZE * `2`)(%rdi), %VMM(`4`)
620	VMOVA (VEC_SIZE * `3`)(%rdi), %VMM(`6`)
621
622	VPMINU %VMM(`0`), %VMM(`2`), %VMM(`8`)
623	VPMINU %VMM(`4`), %VMM(`6`), %VMM(`9`)
624
625	/ A zero CHAR in YMM9 means that there is a null CHAR. /
626	VPMINU %VMM(`8`), %VMM(`9`), %VMM(`9`)
627
628	/ Each bit set in K1 represents a non-null CHAR in YMM9. /
629	VPTESTM %VMM(`9`), %VMM(`9`), %k1
630	# ifndef USE_AS_STRCASECMP_L
631	vpxorq (VEC_SIZE * `0`)(%rsi), %VMM(`0`), %VMM(`1`)
632	vpxorq (VEC_SIZE * `1`)(%rsi), %VMM(`2`), %VMM(`3`)
633	vpxorq (VEC_SIZE * `2`)(%rsi), %VMM(`4`), %VMM(`5`)
634	/ Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while*
635	oring with YMM1. Result is stored in YMM6. /*
636	vpternlogd $`0xde`, (VEC_SIZE * `3`)(%rsi), %VMM(`1`), %VMM(`6`)
637	# else
638	VMOVU (VEC_SIZE * `0`)(%rsi), %VMM(`1`)
639	TOLOWER_VMM (%VMM(`0`), %VMM(`1`))
640	VMOVU (VEC_SIZE * `1`)(%rsi), %VMM(`3`)
641	TOLOWER_VMM (%VMM(`2`), %VMM(`3`))
642	VMOVU (VEC_SIZE * `2`)(%rsi), %VMM(`5`)
643	TOLOWER_VMM (%VMM(`4`), %VMM(`5`))
644	VMOVU (VEC_SIZE * `3`)(%rsi), %VMM(`7`)
645	TOLOWER_VMM (%VMM(`6`), %VMM(`7`))
646	vpxorq %VMM(`0`), %VMM(`1`), %VMM(`1`)
647	vpxorq %VMM(`2`), %VMM(`3`), %VMM(`3`)
648	vpxorq %VMM(`4`), %VMM(`5`), %VMM(`5`)
649	vpternlogd $`0xde`, %VMM(`7`), %VMM(`1`), %VMM(`6`)
650	# endif
651	/ Or together YMM3, YMM5, and YMM6. /
652	vpternlogd $`0xfe`, %VMM(`3`), %VMM(`5`), %VMM(`6`)
653
654
655	/ A non-zero CHAR in YMM6 represents a mismatch. /
656	VPTESTNM %VMM(`6`), %VMM(`6`), %k0{%k1}
657	KMOV %k0, %LOOP_REG
658
659	TESTEQ %LOOP_REG
660	jz L(loop)
661
662
663	/* Find which VEC has the mismatch or end of string.  */
664	VPTESTM %VMM(`0`), %VMM(`0`), %k1
665	VPTESTNM %VMM(`1`), %VMM(`1`), %k0{%k1}
666	KMOV %k0, %VRCX
667	TESTEQ %VRCX
668	jnz L(return_vec_0_end)
669
670	VPTESTM %VMM(`2`), %VMM(`2`), %k1
671	VPTESTNM %VMM(`3`), %VMM(`3`), %k0{%k1}
672	KMOV %k0, %VRCX
673	TESTEQ %VRCX
674	jnz L(return_vec_1_end)
675
676
677	/ Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.*
678	*/
679	L(return_vec_2_3_end):
680	# ifdef USE_AS_STRNCMP
681	subq $(CHAR_PER_VEC * `2`), %rdx
682	jbe L(ret_zero_end)
683	# endif
684
685	VPTESTM %VMM(`4`), %VMM(`4`), %k1
686	VPTESTNM %VMM(`5`), %VMM(`5`), %k0{%k1}
687	KMOV %k0, %VRCX
688	TESTEQ %VRCX
689	# if CHAR_PER_VEC <= 16
690	sall $CHAR_PER_VEC, %LOOP_REG
691	orl %ecx, %LOOP_REG
692	# elif CHAR_PER_VEC <= 32
693	salq $CHAR_PER_VEC, %LOOP_REG64
694	orq %rcx, %LOOP_REG64
695	# else
696	/* We aren't combining the last 2x VEC so branch on the second to last.
697	 */
698	jnz L(return_vec_2_end)
699	# endif
700
701	/ LOOP_REG contains matches for null/mismatch from the loop. If*
702	VEC 0,1,and 2 all have no null and no mismatches then
703	mismatch must entirely be from VEC 3 which is fully
704	represented by LOOP_REG. /*
705	# if CHAR_PER_VEC <= 16
706	bsf %LOOP_REG, %LOOP_REG
707	# else
708	bsfq %LOOP_REG64, %LOOP_REG64
709	# endif
710	# ifdef USE_AS_STRNCMP
711
712	/ If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to*
713	adj length before last comparison. /*
714	# if CHAR_PER_VEC == 64
715	subq $CHAR_PER_VEC, %rdx
716	jbe L(ret_zero_end)
717	# endif
718
719	cmpq %LOOP_REG64, %rdx
720	jbe L(ret_zero_end)
721	# endif
722
723	# ifdef USE_AS_WCSCMP
724	movl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
725	xorl %eax, %eax
726	cmpl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
727	je L(ret5)
728	setl %al
729	negl %eax
730	xorl %r8d, %eax
731	# else
732	movzbl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
733	movzbl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
734	TOLOWER_gpr (%rax, %eax)
735	TOLOWER_gpr (%rcx, %ecx)
736	subl %ecx, %eax
737	xorl %r8d, %eax
738	subl %r8d, %eax
739	# endif
740	L(ret5):
741	ret
742
743	# ifdef USE_AS_STRNCMP
744	.p2align `4`,, `2`
745	L(ret_zero_end):
746	xorl %eax, %eax
747	ret
748	# endif
749
750
751
752	/ The L(return_vec_N_end) differ from L(return_vec_N) in that*
753	they use the value of `r8` to negate the return value. This
754	is because the page cross logic can swap `rdi` and `rsi`.
755	*/
756	.p2align `4`,, `10`
757	# ifdef USE_AS_STRNCMP
758	L(return_vec_1_end):
759	# if CHAR_PER_VEC <= 32
760	/ If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)*
761	without additional branches by adjusting the bit positions
762	from VEC1. We can't do this for CHAR_PER_VEC == 64. /*
763	# if CHAR_PER_VEC <= 16
764	sall $CHAR_PER_VEC, %ecx
765	# else
766	salq $CHAR_PER_VEC, %rcx
767	# endif
768	# else
769	/ If CHAR_PER_VEC == 64 we can't shift the return GPR so just*
770	check it. /*
771	bsf %VRCX, %VRCX
772	addl $(CHAR_PER_VEC), %ecx
773	cmpq %rcx, %rdx
774	ja L(ret_vec_0_end_finish)
775	xorl %eax, %eax
776	ret
777	# endif
778	# endif
779	L(return_vec_0_end):
780	# if (CHAR_PER_VEC <= 16) \|\| !(defined USE_AS_STRNCMP)
781	bsf %VRCX, %VRCX
782	# else
783	bsfq %rcx, %rcx
784	# endif
785
786	# ifdef USE_AS_STRNCMP
787	cmpq %rcx, %rdx
788	jbe L(ret_zero_end)
789	# endif
790
791	L(ret_vec_0_end_finish):
792	# ifdef USE_AS_WCSCMP
793	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
794	xorl %eax, %eax
795	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
796	je L(ret6)
797	setl %al
798	negl %eax
799	/* This is the non-zero case for `eax` so just xorl with `r8d` to
800	flip if `rdi` and `rsi` were swapped.  */
801	xorl %r8d, %eax
802	# else
803	movzbl (%rdi, %rcx), %eax
804	movzbl (%rsi, %rcx), %ecx
805	TOLOWER_gpr (%rax, %eax)
806	TOLOWER_gpr (%rcx, %ecx)
807	subl %ecx, %eax
808	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
809	logic. Subtract `r8d` after xor for zero case.  */
810	xorl %r8d, %eax
811	subl %r8d, %eax
812	# endif
813	L(ret6):
814	ret
815
816	# ifndef USE_AS_STRNCMP
817	.p2align `4`,, `10`
818	L(return_vec_1_end):
819	bsf %VRCX, %VRCX
820	# ifdef USE_AS_WCSCMP
821	movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
822	xorl %eax, %eax
823	cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
824	je L(ret7)
825	setl %al
826	negl %eax
827	xorl %r8d, %eax
828	# else
829	movzbl VEC_SIZE(%rdi, %rcx), %eax
830	movzbl VEC_SIZE(%rsi, %rcx), %ecx
831	TOLOWER_gpr (%rax, %eax)
832	TOLOWER_gpr (%rcx, %ecx)
833	subl %ecx, %eax
834	xorl %r8d, %eax
835	subl %r8d, %eax
836	# endif
837	L(ret7):
838	ret
839	# endif
840
841
842	/* L(return_vec_2_end): return path taken when the null/mismatch is in VEC 2 of the 4x loop (VRCX holds the TESTEQ'd mask for VEC 2). If CHAR_PER_VEC == 64 we can't combine matches from the last
843	2x VEC so need separate return label. For strncmp, zero is returned when the remaining length (rdx) is <= the index of the first mismatch/null.  */
844	# if CHAR_PER_VEC == 64
845	L(return_vec_2_end):
846	bsf %VRCX, %VRCX
847	# ifdef USE_AS_STRNCMP
848	cmpq %rcx, %rdx
849	jbe L(ret_zero_end)
850	# endif
851	# ifdef USE_AS_WCSCMP
852	movl (VEC_SIZE * `2`)(%rdi, %rcx, SIZE_OF_CHAR), %edx
853	xorl %eax, %eax
854	cmpl (VEC_SIZE * `2`)(%rsi, %rcx, SIZE_OF_CHAR), %edx
855	je L(ret13)
856	setl %al
857	negl %eax
858	/* This is the non-zero case for `eax` so just xorl with `r8d` to
859	flip if `rdi` and `rsi` were swapped.  */
860	xorl %r8d, %eax
861	# else
862	movzbl (VEC_SIZE * `2`)(%rdi, %rcx), %eax
863	movzbl (VEC_SIZE * `2`)(%rsi, %rcx), %ecx
864	TOLOWER_gpr (%rax, %eax)
865	TOLOWER_gpr (%rcx, %ecx)
866	subl %ecx, %eax
867	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
868	logic. Subtract `r8d` after xor for zero case.  */
869	xorl %r8d, %eax
870	subl %r8d, %eax
871	# endif
872	L(ret13):
873	ret
874	# endif
875
876
877	/ Page cross in rsi in next 4x VEC. /
878
879	/ TODO: Improve logic here. /
880	.p2align `4`,, `10`
881	L(page_cross_during_loop):
882	/ eax contains [distance_from_page - (VEC_SIZE * 4)]. /
883
884	/* Optimistically rsi and rdi are both aligned in which case we
885	don't need any logic here.  */
886	cmpl $-(VEC_SIZE * `4`), %eax
887	/ Don't adjust eax before jumping back to loop and we will*
888	never hit page cross case again. /*
889	je L(loop_skip_page_cross_check)
890
891	/ Check if we can safely load a VEC. /
892	cmpl $-(VEC_SIZE * `3`), %eax
893	jle L(less_1x_vec_till_page_cross)
894
895	VMOVA (%rdi), %VMM(`0`)
896	VPTESTM %VMM(`0`), %VMM(`0`), %k2
897	CMP_R1_S2_VMM (%VMM(`0`), (%rsi), %VMM(`1`), %k1){%k2}
898	KMOV %k1, %VRCX
899	TESTEQ %VRCX
900	jnz L(return_vec_0_end)
901
902	/ if distance >= 2x VEC then eax > -(VEC_SIZE * 2). /
903	cmpl $-(VEC_SIZE * `2`), %eax
904	jg L(more_2x_vec_till_page_cross)
905
906	.p2align `4`,, `4`
907	L(less_1x_vec_till_page_cross):
908	subl $-(VEC_SIZE * `4`), %eax
909	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
910	concerning case is first iteration if incoming s1 was near start
911	of a page and s2 near end. If s1 was near the start of the page
912	we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
913	to read back -VEC_SIZE. If rdi is truly at the start of a page
914	here, it means the previous page (rdi - VEC_SIZE) has already
915	been loaded earlier so must be valid.  */
916	VMOVU -VEC_SIZE(%rdi, %rax), %VMM(`0`)
917	VPTESTM %VMM(`0`), %VMM(`0`), %k2
918	CMP_R1_S2_VMM (%VMM(`0`), -VEC_SIZE(%rsi, %rax), %VMM(`1`), %k1){%k2}
919	/ Mask of potentially valid bits. The lower bits can be out of*
920	range comparisons (but safe regarding page crosses). /*
921
922	# ifdef USE_AS_WCSCMP
923	movl $-`1`, %r10d
924	movl %esi, %ecx
925	andl $(VEC_SIZE - `1`), %ecx
926	shrl $`2`, %ecx
927	shlxl %ecx, %r10d, %ecx
928	/ Depending on CHAR_PER_VEC extract mask for possible in-bound*
929	matches. /*
930	# if CHAR_PER_VEC == 16
931	movzwl %cx, %r10d
932	# elif CHAR_PER_VEC == 8
933	movzbl %cl, %r10d
934	# else
935	# error "Invalid CHAR_SIZE or VEC_SIZE"
936	# endif
937	# else
938	mov $-`1`, %VRCX
939	shlx %VRSI, %VRCX, %VR10
940	# endif
941
942	KMOV %k1, %VRCX
943	not %VRCX
944
945
946	# ifdef USE_AS_STRNCMP
947	# ifdef USE_AS_WCSCMP
948	/ NB: strcasecmp not used with WCSCMP so this access to r11 is*
949	safe. /*
950	movl %eax, %r11d
951	shrl $`2`, %r11d
952	cmpq %r11, %rdx
953	# else
954	cmpq %rax, %rdx
955	# endif
956	jbe L(return_page_cross_end_check)
957	# endif
958	movl %eax, %OFFSET_REG
959
960	/ Readjust eax before potentially returning to the loop. /
961	addl $(PAGE_SIZE - VEC_SIZE * `4`), %eax
962
963	and %VR10, %VRCX
964	jz L(loop_skip_page_cross_check)
965
966	bsf %VRCX, %VRCX
967
968	# if (defined USE_AS_STRNCMP) \|\| (defined USE_AS_WCSCMP)
969	leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
970	L(return_page_cross_cmp_mem):
971	# else
972	addl %OFFSET_REG, %ecx
973	# endif
974	# ifdef USE_AS_WCSCMP
975	movl VEC_OFFSET(%rdi, %rcx), %edx
976	xorl %eax, %eax
977	cmpl VEC_OFFSET(%rsi, %rcx), %edx
978	je L(ret8)
979	setl %al
980	negl %eax
981	xorl %r8d, %eax
982	# else
983	movzbl VEC_OFFSET(%rdi, %rcx), %eax
984	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
985	TOLOWER_gpr (%rax, %eax)
986	TOLOWER_gpr (%rcx, %ecx)
987	subl %ecx, %eax
988	xorl %r8d, %eax
989	subl %r8d, %eax
990	# endif
991	L(ret8):
992	ret
993
994	# ifdef USE_AS_STRNCMP
995	.p2align `4`,, `10`
996	L(return_page_cross_end_check):
997	and %VR10, %VRCX
998	/ Need to use tzcnt here as VRCX may be zero. If VRCX is zero*
999	tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is
1000	guaranteed to be <= CHAR_PER_VEC so we will only use the return
1001	idx if VRCX was non-zero. /*
1002	tzcnt %VRCX, %VRCX
1003	leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1004	# ifdef USE_AS_WCSCMP
1005	sall $`2`, %edx
1006	# endif
1007	cmpl %ecx, %edx
1008	ja L(return_page_cross_cmp_mem)
1009	xorl %eax, %eax
1010	ret
1011	# endif
1012
1013
1014	.p2align `4`,, `10`
1015	L(more_2x_vec_till_page_cross):
1016	/* If more than 2x vec till cross we will complete a full loop
1017	iteration here.  */
1018
1019	VMOVA VEC_SIZE(%rdi), %VMM(`0`)
1020	VPTESTM %VMM(`0`), %VMM(`0`), %k2
1021	CMP_R1_S2_VMM (%VMM(`0`), VEC_SIZE(%rsi), %VMM(`1`), %k1){%k2}
1022	KMOV %k1, %VRCX
1023	TESTEQ %VRCX
1024	jnz L(return_vec_1_end)
1025
1026	# ifdef USE_AS_STRNCMP
1027	cmpq $(CHAR_PER_VEC * `2`), %rdx
1028	jbe L(ret_zero_in_loop_page_cross)
1029	# endif
1030
1031	subl $-(VEC_SIZE * `4`), %eax
1032
1033	/ Safe to include comparisons from lower bytes. /
1034	VMOVU -(VEC_SIZE * `2`)(%rdi, %rax), %VMM(`0`)
1035	VPTESTM %VMM(`0`), %VMM(`0`), %k2
1036	CMP_R1_S2_VMM (%VMM(`0`), -(VEC_SIZE * `2`)(%rsi, %rax), %VMM(`1`), %k1){%k2}
1037	KMOV %k1, %VRCX
1038	TESTEQ %VRCX
1039	jnz L(return_vec_page_cross_0)
1040
1041	VMOVU -(VEC_SIZE * `1`)(%rdi, %rax), %VMM(`0`)
1042	VPTESTM %VMM(`0`), %VMM(`0`), %k2
1043	CMP_R1_S2_VMM (%VMM(`0`), -(VEC_SIZE * `1`)(%rsi, %rax), %VMM(`1`), %k1){%k2}
1044	KMOV %k1, %VRCX
1045	TESTEQ %VRCX
1046	jnz L(return_vec_page_cross_1)
1047
1048	# ifdef USE_AS_STRNCMP
1049	/* Must check length here as length might preclude reading next
1050	page.  */
1051	# ifdef USE_AS_WCSCMP
1052	/ NB: strcasecmp not used with WCSCMP so this access to r11 is*
1053	safe. /*
1054	movl %eax, %r11d
1055	shrl $`2`, %r11d
1056	cmpq %r11, %rdx
1057	# else
1058	cmpq %rax, %rdx
1059	# endif
1060	jbe L(ret_zero_in_loop_page_cross)
1061	# endif
1062
1063	/ Finish the loop. /
1064	VMOVA (VEC_SIZE * `2`)(%rdi), %VMM(`4`)
1065	VMOVA (VEC_SIZE * `3`)(%rdi), %VMM(`6`)
1066	VPMINU %VMM(`4`), %VMM(`6`), %VMM(`9`)
1067	VPTESTM %VMM(`9`), %VMM(`9`), %k1
1068	# ifndef USE_AS_STRCASECMP_L
1069	vpxorq (VEC_SIZE * `2`)(%rsi), %VMM(`4`), %VMM(`5`)
1070	/ YMM6 = YMM5 \| ((VEC_SIZE * 3)(%rsi) ^ YMM6). /
1071	vpternlogd $`0xde`, (VEC_SIZE * `3`)(%rsi), %VMM(`5`), %VMM(`6`)
1072	# else
1073	VMOVU (VEC_SIZE * `2`)(%rsi), %VMM(`5`)
1074	TOLOWER_VMM (%VMM(`4`), %VMM(`5`))
1075	VMOVU (VEC_SIZE * `3`)(%rsi), %VMM(`7`)
1076	TOLOWER_VMM (%VMM(`6`), %VMM(`7`))
1077	vpxorq %VMM(`4`), %VMM(`5`), %VMM(`5`)
1078	vpternlogd $`0xde`, %VMM(`7`), %VMM(`5`), %VMM(`6`)
1079	# endif
1080	VPTESTNM %VMM(`6`), %VMM(`6`), %k0{%k1}
1081	KMOV %k0, %LOOP_REG
1082	TESTEQ %LOOP_REG
1083	jnz L(return_vec_2_3_end)
1084
1085	/ Best for code size to include ucond-jmp here. Would be faster*
1086	if this case is hot to duplicate the L(return_vec_2_3_end)
1087	code as fall-through and have jump back to loop on mismatch
1088	comparison. /*
1089	subq $-(VEC_SIZE * `4`), %rdi
1090	subq $-(VEC_SIZE * `4`), %rsi
1091	addl $(PAGE_SIZE - VEC_SIZE * `8`), %eax
1092	# ifdef USE_AS_STRNCMP
1093	subq $(CHAR_PER_VEC * `4`), %rdx
1094	ja L(loop_skip_page_cross_check)
1095	L(ret_zero_in_loop_page_cross):
1096	xorl %eax, %eax
1097	ret
1098	# else
1099	jmp L(loop_skip_page_cross_check)
1100	# endif
1101
1102
1103	.p2align `4`,, `10`
1104	L(return_vec_page_cross_0):
1105	addl $-VEC_SIZE, %eax
1106	L(return_vec_page_cross_1):
1107	bsf %VRCX, %VRCX
1108	# if defined USE_AS_STRNCMP \|\| defined USE_AS_WCSCMP
1109	leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1110	# ifdef USE_AS_STRNCMP
1111	# ifdef USE_AS_WCSCMP
1112	/ Must divide ecx instead of multiply rdx due to overflow. /
1113	movl %ecx, %eax
1114	shrl $`2`, %eax
1115	cmpq %rax, %rdx
1116	# else
1117	cmpq %rcx, %rdx
1118	# endif
1119	jbe L(ret_zero_in_loop_page_cross)
1120	# endif
1121	# else
1122	addl %eax, %ecx
1123	# endif
1124
1125	# ifdef USE_AS_WCSCMP
1126	movl VEC_OFFSET(%rdi, %rcx), %edx
1127	xorl %eax, %eax
1128	cmpl VEC_OFFSET(%rsi, %rcx), %edx
1129	je L(ret9)
1130	setl %al
1131	negl %eax
1132	xorl %r8d, %eax
1133	# else
1134	movzbl VEC_OFFSET(%rdi, %rcx), %eax
1135	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
1136	TOLOWER_gpr (%rax, %eax)
1137	TOLOWER_gpr (%rcx, %ecx)
1138	subl %ecx, %eax
1139	xorl %r8d, %eax
1140	subl %r8d, %eax
1141	# endif
1142	L(ret9):
1143	ret
1144
1145
1146	.p2align `4`,, `10`
1147	L(page_cross):
1148	# ifndef USE_AS_STRNCMP
1149	/ If both are VEC aligned we don't need any special logic here.*
1150	Only valid for strcmp where stop condition is guaranteed to
1151	be reachable by just reading memory. /*
1152	testl $((VEC_SIZE - `1`) << `20`), %eax
1153	jz L(no_page_cross)
1154	# endif
1155
1156	movl %edi, %eax
1157	movl %esi, %ecx
1158	andl $(PAGE_SIZE - `1`), %eax
1159	andl $(PAGE_SIZE - `1`), %ecx
1160
1161	xorl %OFFSET_REG, %OFFSET_REG
1162
1163	/ Check which is closer to page cross, s1 or s2. /
1164	cmpl %eax, %ecx
1165	jg L(page_cross_s2)
1166
1167	/ The previous page cross check has false positives. Check for*
1168	true positive as page cross logic is very expensive. /*
1169	subl $(PAGE_SIZE - VEC_SIZE * `4`), %eax
1170	jbe L(no_page_cross)
1171
1172
1173	/ Set r8 to not interfere with normal return value (rdi and rsi*
1174	did not swap). /*
1175	# ifdef USE_AS_WCSCMP
1176	/* Any non-zero positive value that doesn't interfere with 0x1.
1177	 */
1178	movl $`2`, %r8d
1179	# else
1180	xorl %r8d, %r8d
1181	# endif
1182
1183	/ Check if less than 1x VEC till page cross. /
1184	subl $(VEC_SIZE * `3`), %eax
1185	jg L(less_1x_vec_till_page)
1186
1187
1188	/ If more than 1x VEC till page cross, loop through safely*
1189	loadable memory until within 1x VEC of page cross. /*
1190	.p2align `4`,, `8`
1191	L(page_cross_loop):
1192	VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(`0`)
1193	VPTESTM %VMM(`0`), %VMM(`0`), %k2
1194	CMP_R1_S2_VMM (%VMM(`0`), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(`1`), %k1){%k2}
1195	KMOV %k1, %VRCX
1196	TESTEQ %VRCX
1197	jnz L(check_ret_vec_page_cross)
1198	addl $CHAR_PER_VEC, %OFFSET_REG
1199	# ifdef USE_AS_STRNCMP
1200	cmpq %OFFSET_REG64, %rdx
1201	jbe L(ret_zero_page_cross)
1202	# endif
1203	addl $VEC_SIZE, %eax
1204	jl L(page_cross_loop)
1205
1206	# ifdef USE_AS_WCSCMP
1207	shrl $`2`, %eax
1208	# endif
1209
1210
1211	subl %eax, %OFFSET_REG
1212	/ OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed*
1213	to not cross page so is safe to load. Since we have already
1214	loaded at least 1 VEC from rsi it is also guaranteed to be
1215	safe. /*
1216	VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(`0`)
1217	VPTESTM %VMM(`0`), %VMM(`0`), %k2
1218	CMP_R1_S2_VMM (%VMM(`0`), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(`1`), %k1){%k2}
1219
1220	KMOV %k1, %VRCX
1221	# ifdef USE_AS_STRNCMP
1222	leal CHAR_PER_VEC(%OFFSET_REG64), %eax
1223	cmpq %rax, %rdx
1224	jbe L(check_ret_vec_page_cross2)
1225	# ifdef USE_AS_WCSCMP
1226	addq $-(CHAR_PER_VEC * `2`), %rdx
1227	# else
1228	addq %rdi, %rdx
1229	# endif
1230	# endif
1231	TESTEQ %VRCX
1232	jz L(prepare_loop_no_len)
1233
1234	.p2align `4`,, `4`
1235	L(ret_vec_page_cross):
1236	# ifndef USE_AS_STRNCMP
1237	L(check_ret_vec_page_cross):
1238	# endif
1239	tzcnt %VRCX, %VRCX
1240	addl %OFFSET_REG, %ecx
1241	L(ret_vec_page_cross_cont):
1242	# ifdef USE_AS_WCSCMP
1243	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
1244	xorl %eax, %eax
1245	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
1246	je L(ret12)
1247	setl %al
1248	negl %eax
1249	xorl %r8d, %eax
1250	# else
1251	movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
1252	movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
1253	TOLOWER_gpr (%rax, %eax)
1254	TOLOWER_gpr (%rcx, %ecx)
1255	subl %ecx, %eax
1256	xorl %r8d, %eax
1257	subl %r8d, %eax
1258	# endif
1259	L(ret12):
1260	ret
1261
1262
1263	# ifdef USE_AS_STRNCMP
1264	.p2align `4`,, `10`
1265	L(check_ret_vec_page_cross2):
1266	TESTEQ %VRCX
1267	L(check_ret_vec_page_cross):
1268	tzcnt %VRCX, %VRCX
1269	addl %OFFSET_REG, %ecx
1270	cmpq %rcx, %rdx
1271	ja L(ret_vec_page_cross_cont)
1272	.p2align `4`,, `2`
1273	L(ret_zero_page_cross):
1274	xorl %eax, %eax
1275	ret
1276	# endif
1277
1278	.p2align `4`,, `4`
1279	L(page_cross_s2):
1280	/ Ensure this is a true page cross. /
1281	subl $(PAGE_SIZE - VEC_SIZE * `4`), %ecx
1282	jbe L(no_page_cross)
1283
1284
1285	movl %ecx, %eax
1286	movq %rdi, %rcx
1287	movq %rsi, %rdi
1288	movq %rcx, %rsi
1289
1290	/ set r8 to negate return value as rdi and rsi swapped. /
1291	# ifdef USE_AS_WCSCMP
1292	movl $-`4`, %r8d
1293	# else
1294	movl $-`1`, %r8d
1295	# endif
1296	xorl %OFFSET_REG, %OFFSET_REG
1297
1298	/ Check if more than 1x VEC till page cross. /
1299	subl $(VEC_SIZE * `3`), %eax
1300	jle L(page_cross_loop)
1301
1302	.p2align `4`,, `6`
1303	L(less_1x_vec_till_page):
1304	# ifdef USE_AS_WCSCMP
1305	shrl $`2`, %eax
1306	# endif
1307
1308	/* Find largest load size we can use.  When VEC_SIZE == 64, first
1309	   check if we can do a full ymm load.  */
1310	# if VEC_SIZE == 64
1311
1312	cmpl $((VEC_SIZE - `32`) / SIZE_OF_CHAR), %eax
1313	ja L(less_32_till_page)
1314
1315
1316	/* Use 32 byte comparison.  */
1317	VMOVU (%rdi), %VMM_256(`0`)
1318	VPTESTM %VMM_256(`0`), %VMM_256(`0`), %k2
1319	CMP_R1_S2_YMM (%VMM_256(`0`), (%rsi), %VMM_256(`1`), %k1){%k2}
1320	kmovd %k1, %ecx
1321	# ifdef USE_AS_WCSCMP
1322	subl $`0xff`, %ecx
1323	# else
1324	incl %ecx
1325	# endif
1326	jnz L(check_ret_vec_page_cross)
1327	movl $((VEC_SIZE - `32`) / SIZE_OF_CHAR), %OFFSET_REG
1328	# ifdef USE_AS_STRNCMP
1329	cmpq %OFFSET_REG64, %rdx
1330	jbe L(ret_zero_page_cross_slow_case64)
1331	subl %eax, %OFFSET_REG
1332	# else
1333	/ Explicit check for 32 byte alignment. /
1334	subl %eax, %OFFSET_REG
1335	jz L(prepare_loop)
1336	# endif
1337	VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(`0`)
1338	VPTESTM %VMM_256(`0`), %VMM_256(`0`), %k2
1339	CMP_R1_S2_YMM (%VMM_256(`0`), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(`1`), %k1){%k2}
1340	kmovd %k1, %ecx
1341	# ifdef USE_AS_WCSCMP
1342	subl $`0xff`, %ecx
1343	# else
1344	incl %ecx
1345	# endif
1346	jnz L(check_ret_vec_page_cross)
1347	# ifdef USE_AS_STRNCMP
1348	addl $(`32` / SIZE_OF_CHAR), %OFFSET_REG
1349	subq %OFFSET_REG64, %rdx
1350	jbe L(ret_zero_page_cross_slow_case64)
1351	subq $-(CHAR_PER_VEC * `4`), %rdx
1352
1353	leaq -(VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1354	leaq -(VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1355	# else
1356	leaq (`32` - VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1357	leaq (`32` - VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1358	# endif
1359	jmp L(prepare_loop_aligned)
1360
1361	# ifdef USE_AS_STRNCMP
1362	.p2align `4`,, `2`
1363	L(ret_zero_page_cross_slow_case64):
1364	xorl %eax, %eax
1365	ret
1366	# endif
1367	L(less_32_till_page):
1368	# endif
1369
1370	/ Find largest load size we can use. /
1371	cmpl $((VEC_SIZE - `16`) / SIZE_OF_CHAR), %eax
1372	ja L(less_16_till_page)
1373
1374	/ Use 16 byte comparison. /
1375	vmovdqu (%rdi), %xmm0
1376	VPTESTM %xmm0, %xmm0, %k2
1377	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
1378	kmovd %k1, %ecx
1379	# ifdef USE_AS_WCSCMP
1380	subl $`0xf`, %ecx
1381	# else
1382	incw %cx
1383	# endif
1384	jnz L(check_ret_vec_page_cross)
1385
1386	movl $((VEC_SIZE - `16`) / SIZE_OF_CHAR), %OFFSET_REG
1387	# ifdef USE_AS_STRNCMP
1388	# if VEC_SIZE == 32
1389	cmpq %OFFSET_REG64, %rdx
1390	# else
1391	cmpq $(`16` / SIZE_OF_CHAR), %rdx
1392	# endif
1393	jbe L(ret_zero_page_cross_slow_case0)
1394	subl %eax, %OFFSET_REG
1395	# else
1396	/ Explicit check for 16 byte alignment. /
1397	subl %eax, %OFFSET_REG
1398	jz L(prepare_loop)
1399	# endif
1400	vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1401	VPTESTM %xmm0, %xmm0, %k2
1402	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
1403	kmovd %k1, %ecx
1404	# ifdef USE_AS_WCSCMP
1405	subl $`0xf`, %ecx
1406	# else
1407	incw %cx
1408	# endif
1409	jnz L(check_ret_vec_page_cross)
1410	# ifdef USE_AS_STRNCMP
1411	addl $(`16` / SIZE_OF_CHAR), %OFFSET_REG
1412	subq %OFFSET_REG64, %rdx
1413	jbe L(ret_zero_page_cross_slow_case0)
1414	subq $-(CHAR_PER_VEC * `4`), %rdx
1415
1416	leaq -(VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1417	leaq -(VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1418	# else
1419	leaq (`16` - VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1420	leaq (`16` - VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1421	# endif
1422	jmp L(prepare_loop_aligned)
1423
1424	# ifdef USE_AS_STRNCMP
1425	.p2align `4`,, `2`
1426	L(ret_zero_page_cross_slow_case0):
1427	xorl %eax, %eax
1428	ret
1429	# endif
1430
1431
1432	.p2align `4`,, `10`
1433	L(less_16_till_page):
1434	cmpl $((VEC_SIZE - `8`) / SIZE_OF_CHAR), %eax
1435	ja L(less_8_till_page)
1436
1437	/ Use 8 byte comparison. /
1438	vmovq (%rdi), %xmm0
1439	vmovq (%rsi), %xmm1
1440	VPTESTM %xmm0, %xmm0, %k2
1441	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1442	kmovd %k1, %ecx
1443	# ifdef USE_AS_WCSCMP
1444	subl $`0x3`, %ecx
1445	# else
1446	incb %cl
1447	# endif
1448	jnz L(check_ret_vec_page_cross)
1449
1450
1451	# ifdef USE_AS_STRNCMP
1452	cmpq $(`8` / SIZE_OF_CHAR), %rdx
1453	jbe L(ret_zero_page_cross_slow_case0)
1454	# endif
1455	movl $((VEC_SIZE - `8`) / SIZE_OF_CHAR), %OFFSET_REG
1456	subl %eax, %OFFSET_REG
1457
1458	vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1459	vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1460	VPTESTM %xmm0, %xmm0, %k2
1461	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1462	kmovd %k1, %ecx
1463	# ifdef USE_AS_WCSCMP
1464	subl $`0x3`, %ecx
1465	# else
1466	incb %cl
1467	# endif
1468	jnz L(check_ret_vec_page_cross)
1469
1470
1471	# ifdef USE_AS_STRNCMP
1472	addl $(`8` / SIZE_OF_CHAR), %OFFSET_REG
1473	subq %OFFSET_REG64, %rdx
1474	jbe L(ret_zero_page_cross_slow_case0)
1475	subq $-(CHAR_PER_VEC * `4`), %rdx
1476
1477	leaq -(VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1478	leaq -(VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1479	# else
1480	leaq (`8` - VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1481	leaq (`8` - VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1482	# endif
1483	jmp L(prepare_loop_aligned)
1484
1485
1486
1487
1488	.p2align `4`,, `10`
1489	L(less_8_till_page):
1490	# ifdef USE_AS_WCSCMP
1491	/ If using wchar then this is the only check before we reach*
1492	the page boundary. /*
1493	movl (%rdi), %eax
1494	movl (%rsi), %ecx
1495	cmpl %ecx, %eax
1496	jnz L(ret_less_8_wcs)
1497	# ifdef USE_AS_STRNCMP
1498	addq $-(CHAR_PER_VEC * `2`), %rdx
1499	/ We already checked for len <= 1 so cannot hit that case here.*
1500	*/
1501	# endif
1502	testl %eax, %eax
1503	jnz L(prepare_loop)
1504	ret
1505
1506	.p2align `4`,, `8`
1507	L(ret_less_8_wcs):
1508	setl %OFFSET_REG8
1509	negl %OFFSET_REG
1510	movl %OFFSET_REG, %eax
1511	xorl %r8d, %eax
1512	ret
1513
1514	# else
1515	cmpl $(VEC_SIZE - `4`), %eax
1516	ja L(less_4_till_page)
1517
1518	vmovd (%rdi), %xmm0
1519	vmovd (%rsi), %xmm1
1520	VPTESTM %xmm0, %xmm0, %k2
1521	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1522	kmovd %k1, %ecx
1523	subl $`0xf`, %ecx
1524	jnz L(check_ret_vec_page_cross)
1525
1526	# ifdef USE_AS_STRNCMP
1527	cmpq $`4`, %rdx
1528	jbe L(ret_zero_page_cross_slow_case1)
1529	# endif
1530	movl $((VEC_SIZE - `4`) / SIZE_OF_CHAR), %OFFSET_REG
1531	subl %eax, %OFFSET_REG
1532
1533	vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1534	vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1535	VPTESTM %xmm0, %xmm0, %k2
1536	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1537	kmovd %k1, %ecx
1538	subl $`0xf`, %ecx
1539	jnz L(check_ret_vec_page_cross)
1540	# ifdef USE_AS_STRNCMP
1541	addl $(`4` / SIZE_OF_CHAR), %OFFSET_REG
1542	subq %OFFSET_REG64, %rdx
1543	jbe L(ret_zero_page_cross_slow_case1)
1544	subq $-(CHAR_PER_VEC * `4`), %rdx
1545
1546	leaq -(VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1547	leaq -(VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1548	# else
1549	leaq (`4` - VEC_SIZE * `4`)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1550	leaq (`4` - VEC_SIZE * `4`)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1551	# endif
1552	jmp L(prepare_loop_aligned)
1553
1554
1555	# ifdef USE_AS_STRNCMP
1556	.p2align `4`,, `2`
1557	L(ret_zero_page_cross_slow_case1):
1558	xorl %eax, %eax
1559	ret
1560	# endif
1561
1562	.p2align `4`,, `10`
1563	L(less_4_till_page):
1564	subq %rdi, %rsi
1565	/ Extremely slow byte comparison loop. /
1566	L(less_4_loop):
1567	movzbl (%rdi), %eax
1568	movzbl (%rsi, %rdi), %ecx
1569	TOLOWER_gpr (%rax, %eax)
1570	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1571	subl %BYTE_LOOP_REG, %eax
1572	jnz L(ret_less_4_loop)
1573	testl %ecx, %ecx
1574	jz L(ret_zero_4_loop)
1575	# ifdef USE_AS_STRNCMP
1576	decq %rdx
1577	jz L(ret_zero_4_loop)
1578	# endif
1579	incq %rdi
1580	/* End condition is reaching the page boundary (rdi is aligned).  */
1581	testb $(VEC_SIZE - `1`), %dil
1582	jnz L(less_4_loop)
1583	leaq -(VEC_SIZE * `4`)(%rdi, %rsi), %rsi
1584	addq $-(VEC_SIZE * `4`), %rdi
1585	# ifdef USE_AS_STRNCMP
1586	subq $-(CHAR_PER_VEC * `4`), %rdx
1587	# endif
1588	jmp L(prepare_loop_aligned)
1589
1590	L(ret_zero_4_loop):
1591	xorl %eax, %eax
1592	ret
1593	L(ret_less_4_loop):
1594	xorl %r8d, %eax
1595	subl %r8d, %eax
1596	ret
1597	# endif
1598	cfi_endproc
1599	.size STRCMP, .-STRCMP
1600	#endif
1601

Browse the source code of glibc/sysdeps/x86_64/multiarch/strcmp-evex.S