memchr-evex.S source code [glibc/sysdeps/x86_64/multiarch/memchr-evex.S]

1	/* memchr/wmemchr optimized with 256-bit EVEX instructions.
2	Copyright (C) 2021-2023 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>.  */
18
19	#include <isa-level.h>
20	#include <sysdep.h>
21
22	#if ISA_SHOULD_BUILD (4)
23
24	# ifndef VEC_SIZE
	/* No vector configuration was chosen by an including file, so
	   fall back to x86-evex256-vecs.h (presumably the 256-bit EVEX
	   register/size definitions -- confirm against that header).  */
25	# include "x86-evex256-vecs.h"
26	# endif
27
	/* Exported symbol name; an including file may predefine MEMCHR.  */
28	# ifndef MEMCHR
29	# define MEMCHR __memchr_evex
30	# endif
31
	/* Element-size dependent instruction selection: dword forms and
	   CHAR_SIZE 4 for wmemchr, byte forms and CHAR_SIZE 1 for memchr.
	   PC_SHIFT_GPR names the register used as the shift count for the
	   match mask in L(page_cross).  */
32	# ifdef USE_AS_WMEMCHR
33	# define PC_SHIFT_GPR rcx
34	# define VPTESTN vptestnmd
35	# define VPBROADCAST vpbroadcastd
36	# define VPMINU vpminud
37	# define VPCMP vpcmpd
38	# define VPCMPEQ vpcmpeqd
39	# define CHAR_SIZE 4
40
41	# define USE_WIDE_CHAR
42	# else
43	# define PC_SHIFT_GPR rdi
44	# define VPTESTN vptestnmb
45	# define VPBROADCAST vpbroadcastb
46	# define VPMINU vpminub
47	# define VPCMP vpcmpb
48	# define VPCMPEQ vpcmpeqb
49	# define CHAR_SIZE 1
50	# endif
51
52	# include "reg-macros.h"
53
54
55	/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
56	   doesn't have VEX encoding), use VEX encoding in loop so we
57	   can use vpcmpeqb + vptern which is more efficient than the
58	   EVEX alternative.  */
59	# if defined USE_IN_RTM \|\| VEC_SIZE == 64
	/* EVEX-only build: the vzeroupper helpers expand to nothing (or a
	   plain ret) and the loop uses the EVEX variant.  */
60	# undef COND_VZEROUPPER
61	# undef VZEROUPPER_RETURN
62	# undef VZEROUPPER
63
64	# define COND_VZEROUPPER
65	# define VZEROUPPER_RETURN ret
66	# define VZEROUPPER
67
68	# define USE_TERN_IN_LOOP 0
69	# else
	/* VEX loop build: VZEROUPPER becomes a real vzeroupper.
	   NOTE(review): COND_VZEROUPPER is not redefined here, so its
	   expansion comes from an included header -- confirm.  */
70	# define USE_TERN_IN_LOOP 1
71	# undef VZEROUPPER
72	# define VZEROUPPER vzeroupper
73	# endif
74
	/* TERN_CHAR_MULT is the index scale used by the loop return
	   paths.  TEST_END () is followed by `jnz L(loop_vec_ret)` in the
	   loop, i.e. it must leave ZF clear when one of the four vectors
	   of the iteration contains a match.  */
75	# if USE_TERN_IN_LOOP
76	/* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
77	   so we don't want to multiply resulting index.  */
78	# define TERN_CHAR_MULT 1
79
80	# ifdef USE_AS_WMEMCHR
81	# define TEST_END() inc %VRCX
82	# else
83	# define TEST_END() add %rdx, %rcx
84	# endif
85	# else
86	# define TERN_CHAR_MULT CHAR_SIZE
87	# define TEST_END() KORTEST %k2, %k3
88	# endif
89
	/* GPR_X0 is the register holding the first loop vector's mask when
	   L(last_vec_x0) is reached.  GPR_X0_IS_RET is 1 when that
	   register is rax and the bsf result is a byte offset, so
	   L(last_vec_x0) can simply add rdi to it.  */
90	# if defined USE_AS_WMEMCHR \|\| !USE_TERN_IN_LOOP
91	# ifndef USE_AS_WMEMCHR
92	# define GPR_X0_IS_RET 1
93	# else
94	# define GPR_X0_IS_RET 0
95	# endif
96	# define GPR_X0 rax
97	# else
98	# define GPR_X0_IS_RET 0
99	# define GPR_X0 rdx
100	# endif
101
	/* Number of elements (bytes or wchars) per vector.  */
102	# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
103
	/* Displacement used by the final return of L(loop_vec_ret):
	   the last vector alone when CHAR_PER_VEC == 64, otherwise the
	   third vector (the last two masks are combined in one GPR).  */
104	# if CHAR_PER_VEC == 64
105	# define LAST_VEC_OFFSET (VEC_SIZE * 3)
106	# else
107	# define LAST_VEC_OFFSET (VEC_SIZE * 2)
108	# endif
	/* MASK_GPR (reg) selects a view of reg sized for a CHAR_PER_VEC
	   bit mask; it is used for the `inc` in L(loop_vec_ret).  */
109	# if CHAR_PER_VEC >= 32
110	# define MASK_GPR(...) VGPR(__VA_ARGS__)
111	# elif CHAR_PER_VEC == 16
112	# define MASK_GPR(reg) VGPR_SZ(reg, 16)
113	# else
114	# define MASK_GPR(reg) VGPR_SZ(reg, 8)
115	# endif
116
	/* VMATCH holds the broadcast search value; VMATCH_LO is the
	   low-register alias used by the VEX-encoded loop.  */
117	# define VMATCH VMM(0)
118	# define VMATCH_LO VMM_lo(0)
119
120	# define PAGE_SIZE 4096
121
122
123	.section SECTION(.text), "ax", @progbits
	/* MEMCHR (buf, ch, len)
	   In:   rdi = buf, esi = ch, rdx = len counted in elements of
	         CHAR_SIZE bytes.
	   Out:  rax = address of the first element equal to ch within
	         the first len elements, or 0 if there is none.
	   Only caller-saved GPRs (rax, rcx, rdx, rsi, rdi), mask
	   registers and vector registers are written.  */
124	ENTRY_P2ALIGN (MEMCHR, `6`)
125	/* Check for zero length.  */
126	test %RDX_LP, %RDX_LP
127	jz L(zero_0)
128
129	# ifdef __ILP32__
130	/* Clear the upper 32 bits.  */
131	movl %edx, %edx
132	# endif
	/* Broadcast the search value to every element of VMATCH.  */
133	VPBROADCAST %esi, %VMATCH
134	/* Check if we may cross page boundary with one vector load.  */
135	movl %edi, %eax
136	andl $(PAGE_SIZE - `1`), %eax
137	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
138	ja L(page_cross)
139
	/* VRAX = per-element equality mask for the first VEC.  */
140	VPCMPEQ (%rdi), %VMATCH, %k0
141	KMOV %k0, %VRAX
142	# ifndef USE_AS_WMEMCHR
143	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: rax already
144	   depends on rsi (through VMATCH) so no worries about a
145	   false-dep on the destination here.  */
146	tzcnt %VRAX, %VRSI
147	/* If rdx <= rsi then either 1) rax was non-zero (there was a
148	   match) but it was out of bounds or 2) rax was zero and rdx
149	   was <= VEC_SIZE so we are done scanning.  */
150	cmpq %rsi, %rdx
151	/* NB: Use branch to return zero/non-zero.  Common usage will
152	   branch on result of function (if return is null/non-null).
153	   This branch can be used to predict the ensuing one so there
154	   is no reason to extend the data-dependency with cmovcc.  */
155	jbe L(zero_0)
156
157	/* If rax is zero then len must be > CHAR_PER_VEC, otherwise
158	   since we already tested len vs tzcnt(rax) (in rsi) we are
159	   good to return this match.  */
160	test %VRAX, %VRAX
161	jz L(more_1x_vec)
162	leaq (%rdi, %rsi), %rax
163	# else
164
165	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
166	   > 1, so tzcnt of an empty mask is not CHAR_PER_VEC.  Compare
	   the length against CHAR_PER_VEC explicitly instead.  */
167	cmpq $CHAR_PER_VEC, %rdx
168	ja L(more_1x_vec)
169	tzcnt %VRAX, %VRAX
170	cmpl %eax, %edx
171	jbe L(zero_0)
172	L(first_vec_x0_ret):
	/* rax = element index of the match in the first VEC.  */
173	leaq (%rdi, %rax, CHAR_SIZE), %rax
174	# endif
175	ret
176
177	/* Only fits in first cache line for VEC_SIZE == 32.  */
178	# if VEC_SIZE == 32
179	.p2align `4`,, `2`
180	L(zero_0):
	/* Return NULL.  */
181	xorl %eax, %eax
182	ret
183	# endif
184
185	.p2align `4`,, `9`
186	L(more_1x_vec):
187	# ifdef USE_AS_WMEMCHR
188	/* If wmemchr still need to test if there was a match in first
189	   VEC.  Use bsf to test here so we can reuse
190	   L(first_vec_x0_ret).  */
191	bsf %VRAX, %VRAX
192	jnz L(first_vec_x0_ret)
193	# endif
194
195	L(page_cross_continue):
	/* Align rdi down to VEC_SIZE and compute in rax the length that
	   remains past VEC_SIZE(%rdi), minus 1 (see below).  */
196	# ifdef USE_AS_WMEMCHR
197	/* We can't use end of the buffer to re-calculate length for
198	   wmemchr as len * CHAR_SIZE may overflow.  */
199	leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
200	andq $(VEC_SIZE * -`1`), %rdi
201	subq %rdi, %rax
	/* Convert the byte difference to a wchar count.  */
202	sarq $`2`, %rax
203	addq %rdx, %rax
204	# else
205	leaq -(VEC_SIZE + `1`)(%rdx, %rdi), %rax
206	andq $(VEC_SIZE * -`1`), %rdi
207	subq %rdi, %rax
208	# endif
209
210	/* rax contains remaining length - 1.  -1 so we can get imm8
211	   encoding in a few additional places saving code size.  */
212
213	/* Needed regardless of remaining length.  */
214	VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
215	KMOV %k0, %VRDX
216
217	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
218	   $(CHAR_PER_VEC * 2), %rax` because its possible for a very
219	   large length to overflow and cause the subtract to carry
220	   despite length being above CHAR_PER_VEC * 2.  */
221	cmpq $(CHAR_PER_VEC * `2` - `1`), %rax
222	ja L(more_2x_vec)
223	L(last_2x_vec):
	/* Also entered from L(last_4x_vec).  Expects: VRDX = match mask
	   of VEC_SIZE(%rdi), al = remaining length - 1 counted from that
	   vector.  */
224
225	test %VRDX, %VRDX
226	jnz L(first_vec_x1_check)
227
228	/* Check the end of data.  NB: use 8-bit operations to save code
229	   size.  We no longer need the full-width of eax and will
230	   perform a write-only operation over eax so there will be no
231	   partial-register stalls.  */
232	subb $(CHAR_PER_VEC * `1` - `1`), %al
233	jle L(zero_0)
234
235	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH, %k0
236	KMOV %k0, %VRCX
237	# ifdef USE_AS_WMEMCHR
238	/* For wmemchr again we can't take advantage of tzcnt(0) ==
239	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
240	test %VRCX, %VRCX
241	jz L(zero_0)
242	# endif
	/* cl = index of the first match; it is in bounds only if it is
	   below the remaining length in al.  */
243	tzcnt %VRCX, %VRCX
244	cmp %cl, %al
245
246	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
247	   fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
248	   not enough space before the next cache line to fit the `lea`
249	   for return.  */
250	# if VEC_SIZE == 64
251	ja L(first_vec_x2_ret)
252	L(zero_0):
253	xorl %eax, %eax
254	ret
255	# else
256	jbe L(zero_0)
257	leaq (VEC_SIZE * `2`)(%rdi, %rcx, CHAR_SIZE), %rax
258	ret
259	# endif
260
261	.p2align `4`,, `5`
262	L(first_vec_x1_check):
	/* Match found in VEC_SIZE(%rdi); VRDX holds its mask and al the
	   remaining length - 1.  Return the match only if its index does
	   not exceed al.  */
263	bsf %VRDX, %VRDX
264	cmpb %dl, %al
265	jb L(zero_4)
266	leaq (VEC_SIZE * `1`)(%rdi, %rdx, CHAR_SIZE), %rax
267	ret
268
269	/* Fits at the end of the cache line here for VEC_SIZE == 32.
270	 */
271	# if VEC_SIZE == 32
272	L(zero_4):
273	xorl %eax, %eax
274	ret
275	# endif
276
277
278	.p2align `4`,, `4`
279	L(first_vec_x2):
	/* Unchecked return: VRCX = non-zero match mask of the vector at
	   (VEC_SIZE * 2)(%rdi).  */
280	bsf %VRCX, %VRCX
281	L(first_vec_x2_ret):
	/* rcx = element index of the match within that vector.  */
282	leaq (VEC_SIZE * `2`)(%rdi, %rcx, CHAR_SIZE), %rax
283	ret
284
285	/* Fits at the end of the cache line here for VEC_SIZE == 64.
286	 */
287	# if VEC_SIZE == 64
288	L(zero_4):
289	xorl %eax, %eax
290	ret
291	# endif
292
293	.p2align `4`,, `4`
294	L(first_vec_x1):
	/* Unchecked return: VRDX = non-zero match mask of the vector at
	   (VEC_SIZE * 1)(%rdi).  */
295	bsf %VRDX, %VRDX
296	leaq (VEC_SIZE * `1`)(%rdi, %rdx, CHAR_SIZE), %rax
297	ret
298
299
300	.p2align `4`,, `5`
301	L(more_2x_vec):
302	/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
303	   length.  */
304
305
306	/* Already computed matches for first VEC in rdx.  */
307	test %VRDX, %VRDX
308	jnz L(first_vec_x1)
309
310
311	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH, %k0
312	KMOV %k0, %VRCX
313	test %VRCX, %VRCX
314	jnz L(first_vec_x2)
315
316	/* Needed regardless of next length check.  */
317	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH, %k0
318	KMOV %k0, %VRCX
319
320	/* Check if we are near the end.  */
321	cmpq $(CHAR_PER_VEC * `4` - `1`), %rax
322	ja L(more_4x_vec)
323
324	test %VRCX, %VRCX
325	jnz L(first_vec_x3_check)
326
327	/* Use 8-bit instructions to save code size.  We won't use full-
328	   width eax again and will perform a write-only operation to
329	   eax so no worries about partial-register stalls.  */
330	subb $(CHAR_PER_VEC * `3`), %al
331	jb L(zero_2)
332	L(last_vec_check):
	/* Also entered from L(last_4x_vec).  al = remaining length - 1
	   counted from the vector at (VEC_SIZE * 4)(%rdi).  */
333	VPCMPEQ (VEC_SIZE * `4`)(%rdi), %VMATCH, %k0
334	KMOV %k0, %VRCX
335	# ifdef USE_AS_WMEMCHR
336	/* For wmemchr again we can't take advantage of tzcnt(0) ==
337	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
338	test %VRCX, %VRCX
339	jz L(zero_2)
340	# endif
	/* Return the match only if its index does not exceed al.  */
341	tzcnt %VRCX, %VRCX
342	cmp %cl, %al
343	jae L(first_vec_x4_ret)
344	L(zero_2):
	/* Return NULL.  */
345	xorl %eax, %eax
346	ret
347
348	/* Fits at the end of the cache line here for VEC_SIZE == 64.
349	   For VEC_SIZE == 32 we put the return label at the end of
350	   L(first_vec_x4).  */
351	# if VEC_SIZE == 64
352	L(first_vec_x4_ret):
353	leaq (VEC_SIZE * `4`)(%rdi, %rcx, CHAR_SIZE), %rax
354	ret
355	# endif
356
357	.p2align `4`,, `6`
358	L(first_vec_x4):
	/* Unchecked return: VRCX = non-zero match mask of the vector at
	   (VEC_SIZE * 4)(%rdi).  */
359	bsf %VRCX, %VRCX
360	# if VEC_SIZE == 32
361	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
362	   cache line as where it is called from so we might as well
363	   save code size by reusing return of L(first_vec_x4).  */
364	L(first_vec_x4_ret):
365	# endif
366	leaq (VEC_SIZE * `4`)(%rdi, %rcx, CHAR_SIZE), %rax
367	ret
368
369	.p2align `4`,, `6`
370	L(first_vec_x3_check):
371	/* Need to adjust remaining length before checking.  */
372	addb $-(CHAR_PER_VEC * `2`), %al
	/* Return the match in (VEC_SIZE * 3)(%rdi) only if its index
	   does not exceed the adjusted remaining length in al.  */
373	bsf %VRCX, %VRCX
374	cmpb %cl, %al
375	jb L(zero_2)
376	leaq (VEC_SIZE * `3`)(%rdi, %rcx, CHAR_SIZE), %rax
377	ret
378
379	.p2align `4`,, `6`
380	L(first_vec_x3):
	/* Unchecked return: VRCX = non-zero match mask of the vector at
	   (VEC_SIZE * 3)(%rdi).  */
381	bsf %VRCX, %VRCX
382	leaq (VEC_SIZE * `3`)(%rdi, %rcx, CHAR_SIZE), %rax
383	ret
384
385	.p2align `4`,, `3`
386	# if !USE_TERN_IN_LOOP
387	.p2align `4`,, `10`
388	# endif
389	L(more_4x_vec):
	/* Entered with VRCX = match mask of (VEC_SIZE * 3)(%rdi) and
	   rax = remaining length - 1.  */
390	test %VRCX, %VRCX
391	jnz L(first_vec_x3)
392
393	VPCMPEQ (VEC_SIZE * `4`)(%rdi), %VMATCH, %k0
394	KMOV %k0, %VRCX
395	test %VRCX, %VRCX
396	jnz L(first_vec_x4)
397
	/* Advance rdi past the five vectors already covered (`sub` of a
	   negative immediate is used as an add) and check whether a
	   full loop iteration still fits in the remaining length.  */
398	subq $-(VEC_SIZE * `5`), %rdi
399	subq $(CHAR_PER_VEC * `8`), %rax
400	jb L(last_4x_vec)
401
	/* Align rdi down to VEC_SIZE * 4 for the loop and add the
	   elements stepped back over to the remaining count in rax.  */
402	# ifdef USE_AS_WMEMCHR
403	movl %edi, %ecx
404	# else
405	addq %rdi, %rax
406	# endif
407
408
409	# if VEC_SIZE == 64
410	/* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No evex
411	   processor has partial register stalls (all have merging
412	   uop).  If that changes this can be removed.  */
413	xorb %dil, %dil
414	# else
415	andq $-(VEC_SIZE * `4`), %rdi
416	# endif
417
418	# ifdef USE_AS_WMEMCHR
	/* ecx = bytes stepped back by the alignment; convert to wchars
	   before adding to the count.  */
419	subl %edi, %ecx
420	sarl $`2`, %ecx
421	addq %rcx, %rax
422	# else
423	subq %rdi, %rax
424	# endif
425
426
427
428	# if USE_TERN_IN_LOOP
429	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
430	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
431	   only as there is no way to encode vpcmpeq with zmm0-15.  */
432	vmovdqa64 %VMATCH, %VMATCH_LO
433	# endif
434
435	.p2align `4`,, `11`
436	L(loop_4x_vec):
437	/* Two versions of the loop.  One that does not require
438	   vzeroupper by not using ymm0-15 and another that does
439	   require vzeroupper because it uses ymm0-15.  The reason why
440	   ymm0-15 is used at all is because there is no EVEX encoding
441	   vpcmpeq and with vpcmpeq this loop can be performed more
442	   efficiently.  The non-vzeroupper version is safe for RTM
443	   while the vzeroupper version should be preferred if RTM are
444	   not supported.  Which loop version we use is determined by
445	   USE_TERN_IN_LOOP.  */
446
447	# if USE_TERN_IN_LOOP
448	/* Since vptern can only take 3x vectors fastest to do 1 vec
449	   separately with EVEX vpcmp.  */
450	# ifdef USE_AS_WMEMCHR
451	/* vptern can only accept masks for epi32/epi64 so can only save
452	   instruction using not equals mask on vptern with wmemchr.
453	 */
454	VPCMP $`4`, (VEC_SIZE * `0`)(%rdi), %VMATCH, %k1
455	# else
456	VPCMPEQ (VEC_SIZE * `0`)(%rdi), %VMATCH, %k1
457	# endif
458	/* Compare 3x with vpcmpeq and or them all together with vptern.
459	 */
460	VPCMPEQ (VEC_SIZE * `1`)(%rdi), %VMATCH_LO, %VMM_lo(`2`)
461	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH_LO, %VMM_lo(`3`)
462	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH_LO, %VMM_lo(`4`)
463	# ifdef USE_AS_WMEMCHR
464	/* This takes the not of or between VEC_lo(2), VEC_lo(3),
465	   VEC_lo(4) as well as combines result from VEC(0) with zero
466	   mask.  The byte mask in rcx is therefore all ones only when
	   none of the four vectors matched, which TEST_END () detects
	   with `inc`.  */
467	vpternlogd $`1`, %VMM_lo(`2`), %VMM_lo(`3`), %VMM_lo(`4`){%k1}{z}
468	vpmovmskb %VMM_lo(`4`), %VRCX
469	# else
470	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
471	   VEC_lo(4).  The mask of the first vector goes to edx so
	   that TEST_END () can combine it with rcx.  */
472	vpternlogd $`254`, %VMM_lo(`2`), %VMM_lo(`3`), %VMM_lo(`4`)
473	vpmovmskb %VMM_lo(`4`), %VRCX
474	KMOV %k1, %edx
475	# endif
476
477	# else
478	/* Loop version that uses EVEX encoding.  k1 = not-equal mask of
	   the first vector.  The xor results are zero exactly where the
	   second/third vector matches; their zero-masked minimum is zero
	   wherever any of the first three vectors matches, which
	   VPTESTN turns into k2.  k3 = equality mask of the fourth.  */
479	VPCMP $`4`, (VEC_SIZE * `0`)(%rdi), %VMATCH, %k1
480	vpxorq (VEC_SIZE * `1`)(%rdi), %VMATCH, %VMM(`2`)
481	vpxorq (VEC_SIZE * `2`)(%rdi), %VMATCH, %VMM(`3`)
482	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH, %k3
483	VPMINU %VMM(`2`), %VMM(`3`), %VMM(`3`){%k1}{z}
484	VPTESTN %VMM(`3`), %VMM(`3`), %k2
485	# endif
486
487
	/* Leave the loop as soon as any of the four vectors matched.  */
488	TEST_END ()
489	jnz L(loop_vec_ret)
490
	/* Advance by four vectors (`sub` of a negative immediate) and
	   continue while the remaining count does not underflow.  */
491	subq $-(VEC_SIZE * `4`), %rdi
492
493	subq $(CHAR_PER_VEC * `4`), %rax
494	jae L(loop_4x_vec)
495
496	/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
497	 */
498	COND_VZEROUPPER
499
500	.p2align `4`,, `10`
501	L(last_4x_vec):
	/* Fewer than four full vectors remain.  Reached by fallthrough
	   from the loop or directly from L(more_4x_vec).  */
502	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
503	   instructions on eax from here on out.  */
504	# if CHAR_PER_VEC != 64
505	andl $(CHAR_PER_VEC * `4` - `1`), %eax
506	# endif
507	VPCMPEQ (VEC_SIZE * `0`)(%rdi), %VMATCH, %k0
	/* Step rdi back one vector so the vector just compared sits at
	   VEC_SIZE(%rdi), the layout L(last_2x_vec) and the shared
	   return labels use.  */
508	subq $(VEC_SIZE * `1`), %rdi
509	KMOV %k0, %VRDX
510	cmpb $(CHAR_PER_VEC * `2` - `1`), %al
511	jbe L(last_2x_vec)
	/* More than two vectors remain, so matches in the next two
	   vectors need no bounds check.  */
512	test %VRDX, %VRDX
513	jnz L(last_vec_x1_novzero)
514
515	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH, %k0
516	KMOV %k0, %VRDX
517	test %VRDX, %VRDX
518	jnz L(last_vec_x2_novzero)
519
520	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH, %k0
521	KMOV %k0, %VRCX
522	test %VRCX, %VRCX
523	jnz L(first_vec_x3_check)
524
	/* If data remains past the fourth vector position, check it in
	   L(last_vec_check); otherwise return NULL.  */
525	subb $(CHAR_PER_VEC * `3`), %al
526	jae L(last_vec_check)
527
528	xorl %eax, %eax
529	ret
530
531	# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
	/* In this configuration the returns in L(loop_vec_ret) scale by
	   TERN_CHAR_MULT (1), so the k-mask indices produced above get
	   their own returns scaled by CHAR_SIZE.  */
532	L(last_vec_x2_novzero):
533	addq $VEC_SIZE, %rdi
534	L(last_vec_x1_novzero):
535	bsf %VRDX, %VRDX
536	leaq (VEC_SIZE * `1`)(%rdi, %rdx, CHAR_SIZE), %rax
537	ret
538	# endif
539
540	# if CHAR_PER_VEC == 64
541	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
542	   64 it needs a separate return label.  */
543	.p2align `4`,, `4`
544	L(last_vec_x2):
545	L(last_vec_x2_novzero):
546	bsf %VRDX, %VRDX
547	leaq (VEC_SIZE * `2`)(%rdi, %rdx, TERN_CHAR_MULT), %rax
548	ret
549	# endif
550
551	.p2align `4`,, `4`
552	L(loop_vec_ret):
	/* The loop found a match somewhere in its four vectors; find
	   the first vector that contains it.  */
553	# if defined USE_AS_WMEMCHR \|\| !USE_TERN_IN_LOOP
	/* k1 is a not-equal mask here, so it is all ones when the first
	   vector has no match.  `inc` then wraps to zero; otherwise the
	   lowest set bit of the result marks the first match.  */
554	KMOV %k1, %VRAX
555	inc %MASK_GPR(rax)
556	# else
	/* rdx already holds the equality mask of the first vector.  */
557	test %VRDX, %VRDX
558	# endif
559	jnz L(last_vec_x0)
560
561
	/* Second vector.  */
562	# if USE_TERN_IN_LOOP
563	vpmovmskb %VMM_lo(`2`), %VRDX
564	# else
565	VPTESTN %VMM(`2`), %VMM(`2`), %k1
566	KMOV %k1, %VRDX
567	# endif
568	test %VRDX, %VRDX
569	jnz L(last_vec_x1)
570
571
	/* Third vector (k2 only covers the third vector by now, as the
	   first two are known to be match-free).  */
572	# if USE_TERN_IN_LOOP
573	vpmovmskb %VMM_lo(`3`), %VRDX
574	# else
575	KMOV %k2, %VRDX
576	# endif
577
578	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
579	   (only if used VEX encoded loop).  */
580	COND_VZEROUPPER
581
582	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
583	   CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
584	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
585	   VEC in a single GPR.  */
586	# if CHAR_PER_VEC == 64
587	# if USE_TERN_IN_LOOP
588	# error "Unsupported"
589	# endif
590
591
592	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
593	test %VRDX, %VRDX
594	jnz L(last_vec_x2)
595	KMOV %k3, %VRDX
596	# else
597	/* CHAR_PER_VEC <= 32 so we can combine the results from the
598	   last 2x VEC.  */
599
600	# if !USE_TERN_IN_LOOP
601	KMOV %k3, %VRCX
602	# endif
	/* Place the last vector's mask (rcx) above the third vector's
	   mask (rdx).  In the VEX loop rcx still holds the value left by
	   the loop; with the first two vectors match-free its lowest set
	   bit belongs to the last vector whenever rdx is zero.  */
603	salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
604	addq %rcx, %rdx
605	# if !defined USE_AS_WMEMCHR \|\| !USE_TERN_IN_LOOP
606	L(last_vec_x2_novzero):
607	# endif
608	# endif
609	bsf %rdx, %rdx
610	leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
611	ret
612
613	.p2align `4`,, `8`
614	L(last_vec_x1):
	/* Match in the vector at (VEC_SIZE * 1)(%rdi); VRDX = its mask.  */
615	COND_VZEROUPPER
616	# if !defined USE_AS_WMEMCHR \|\| !USE_TERN_IN_LOOP
617	L(last_vec_x1_novzero):
618	# endif
619	bsf %VRDX, %VRDX
620	leaq (VEC_SIZE * `1`)(%rdi, %rdx, TERN_CHAR_MULT), %rax
621	ret
622
623
624	.p2align `4`,, `4`
625	L(last_vec_x0):
	/* Match in the vector at (%rdi); GPR_X0 holds the value whose
	   lowest set bit is the match index.  */
626	COND_VZEROUPPER
627	bsf %VGPR(GPR_X0), %VGPR(GPR_X0)
628	# if GPR_X0_IS_RET
629	addq %rdi, %rax
630	# else
631	leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
632	# endif
633	ret
634
635	.p2align `4`,, `6`
636	L(page_cross):
	/* Entered from the entry check with eax = rdi & (PAGE_SIZE - 1).
	   A VEC_SIZE load at rdi could cross the page, so load the last
	   full vector of the current page instead and shift the mask so
	   bit 0 corresponds to the element at rdi.  */
637	/* Need to preserve eax to compute inbound bytes we are
638	   checking.  */
639	# ifdef USE_AS_WMEMCHR
640	movl %eax, %ecx
641	# else
	/* ecx = -eax.  */
642	xorl %ecx, %ecx
643	subl %eax, %ecx
644	# endif
645
	/* rax = rdi with the page-offset bits cleared (page base).  */
646	xorq %rdi, %rax
647	VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
648	KMOV %k0, %VRAX
649
650	# ifdef USE_AS_WMEMCHR
651	/* NB: Divide by CHAR_SIZE to shift out out of bounds bytes.  */
652	shrl $`2`, %ecx
653	andl $(CHAR_PER_VEC - `1`), %ecx
654	# endif
655
656
	/* Drop the mask bits of elements that precede rdi.  */
657	shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
658
659	# ifdef USE_AS_WMEMCHR
660	negl %ecx
661	# endif
662
663	/* mask lower bits from ecx (negative eax) to get bytes till
664	   next VEC.  */
665	andl $(CHAR_PER_VEC - `1`), %ecx
666
667	/* Check if VEC is entirely contained in the remainder of the
668	   page.  */
669	cmpq %rcx, %rdx
670	jbe L(page_cross_ret)
671
672	/* Length crosses the page so if rax is zero (no matches)
673	   continue.  */
674	test %VRAX, %VRAX
675	jz L(page_cross_continue)
676
677	/* if rdx > rcx then any match here must be in [buf:buf + len].
678	 */
679	tzcnt %VRAX, %VRAX
680	# ifdef USE_AS_WMEMCHR
681	leaq (%rdi, %rax, CHAR_SIZE), %rax
682	# else
683	addq %rdi, %rax
684	# endif
685	ret
686
687	.p2align `4`,, `2`
688	L(page_cross_zero):
	/* Return NULL.  */
689	xorl %eax, %eax
690	ret
691
692	.p2align `4`,, `4`
693	L(page_cross_ret):
694	/* Search is entirely contained in page cross case.  */
695	# ifdef USE_AS_WMEMCHR
696	test %VRAX, %VRAX
697	jz L(page_cross_zero)
698	# endif
	/* Return the first match only if its index is below len.  */
699	tzcnt %VRAX, %VRAX
700	cmpl %eax, %edx
701	jbe L(page_cross_zero)
702	# ifdef USE_AS_WMEMCHR
703	leaq (%rdi, %rax, CHAR_SIZE), %rax
704	# else
705	addq %rdi, %rax
706	# endif
707	ret
708	END (MEMCHR)
709	#endif
710

Browse the source code of glibc/sysdeps/x86_64/multiarch/memchr-evex.S