/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR __memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ vpcmpeqd
#  define VPBROADCAST vpbroadcastd
#  define CHAR_SIZE 4
# else
#  define VPCMPEQ vpcmpeqb
#  define VPBROADCAST vpbroadcastb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG ecx
#  define RRAW_PTR_REG rcx
#  define ALGN_PTR_REG rdi
# else
#  define ERAW_PTR_REG edi
#  define RRAW_PTR_REG rdi
#  define ALGN_PTR_REG rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
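
/* Overview (descriptive comment only): CHAR is broadcast into ymm0 and
   compared VEC_SIZE bytes at a time with VPCMPEQ.  vpmovmskb turns each
   compare result into a bitmask whose lowest set bit (tzcnt) is the
   offset of the first match.  A rough, illustrative C sketch of the
   per-vector step (pseudocode, not the build source):

        mask = movemask (cmpeq (load_32bytes (p), broadcast (c)));
        if (mask != 0)
          return p + count_trailing_zeros (mask);

   The entry checks one (possibly page-crossing) vector, then up to four
   unaligned vectors, then runs a 4x-vector aligned main loop.  */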

        .section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
        /* Check for zero length.  */
#  ifdef __ILP32__
        /* Clear upper bits.  */
        and     %RDX_LP, %RDX_LP
#  else
        test    %RDX_LP, %RDX_LP
#  endif
        jz      L(null)
# endif
        /* Broadcast CHAR to ymm0.  */
        vmovd   %esi, %xmm0
        VPBROADCAST %xmm0, %ymm0
        /* Check if we may cross a page boundary with one vector load.  */
        movl    %edi, %eax
        andl    $(PAGE_SIZE - 1), %eax
        cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
        ja      L(cross_page_boundary)
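        /* NB: the branch above is taken when (ptr % PAGE_SIZE) >
           PAGE_SIZE - VEC_SIZE, i.e. fewer than VEC_SIZE bytes remain
           on this page, so a full VEC_SIZE load from ptr could fault on
           the next page even when a match lies within bounds.  */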

        /* Check the first VEC_SIZE bytes.  */
        VPCMPEQ (%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* If length <= CHAR_PER_VEC handle specially.  */
        cmpq    $CHAR_PER_VEC, %rdx
        jbe     L(first_vec_x0)
# endif
        testl   %eax, %eax
        jz      L(aligned_more)
        tzcntl  %eax, %eax
        addq    %rdi, %rax
        VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
        .p2align 5
L(first_vec_x0):
        /* Check if first match was before length.  */
        tzcntl  %eax, %eax
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
        xorl    %ecx, %ecx
        cmpl    %eax, %edx
        leaq    (%rdi, %rax), %rax
        cmovle  %rcx, %rax
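        /* NB: tzcnt of an all-zero mask yields the operand size (32),
           and edx is at most VEC_SIZE bytes here, so the cmovle above
           selects the NULL in rcx both when there is no match at all
           and when the first match lies at or beyond the length.  */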
        VZEROUPPER_RETURN

L(null):
        xorl    %eax, %eax
        ret
# endif
        .p2align 4
L(cross_page_boundary):
        /* Save pointer before aligning, as its original value is needed
           for computing the return address if a byte is found, or for
           adjusting the length if it is not and this is memchr.  */
        movq    %rdi, %rcx
        /* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
           and rdi for rawmemchr.  */
        orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
        VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* Calculate length until end of page (length checked for a
           match).  */
        leaq    1(%ALGN_PTR_REG), %rsi
        subq    %RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get wchar_t count.  */
        shrl    $2, %esi
#  endif
# endif
        /* Remove the leading bytes.  */
        sarxl   %ERAW_PTR_REG, %eax, %eax
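        /* NB: only the low 5 bits of the sarx shift count are used for
           a 32-bit operand, so the raw pointer register serves as the
           count directly: shifting right by ptr % VEC_SIZE drops the
           mask bits for the bytes below the original pointer.  */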
# ifndef USE_AS_RAWMEMCHR
        /* Check the end of data.  */
        cmpq    %rsi, %rdx
        jbe     L(first_vec_x0)
# endif
        testl   %eax, %eax
        jz      L(cross_page_continue)
        tzcntl  %eax, %eax
        addq    %RRAW_PTR_REG, %rax
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        .p2align 4
L(first_vec_x1):
        tzcntl  %eax, %eax
        incq    %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(first_vec_x2):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(first_vec_x3):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 2 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(first_vec_x4):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 3 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(aligned_more):
        /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
           since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
        /* Align data to VEC_SIZE - 1.  */
        xorl    %ecx, %ecx
        subl    %edi, %ecx
        orq     $(VEC_SIZE - 1), %rdi
        /* esi is for adjusting length to see if near the end.  */
        leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get the wchar_t count.  */
        sarl    $2, %esi
#  endif
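        /* rsi now holds the number of CHARs from the original pointer
           through the end of the fourth vector checked below; comparing
           it against the length decides whether the tail path is
           needed.  */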
# else
        orq     $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
        /* Load first VEC regardless.  */
        VPCMPEQ 1(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* Adjust length.  If near end handle specially.  */
        subq    %rsi, %rdx
        jbe     L(last_4x_vec_or_less)
# endif
        testl   %eax, %eax
        jnz     L(first_vec_x1)

        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x2)

        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x3)

        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
        /* Check if we are within the last CHAR_PER_VEC * 4 of the
           length.  */
        subq    $(CHAR_PER_VEC * 4), %rdx
        jbe     L(last_4x_vec_or_less_cmpeq)
        /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
           length.  */
        incq    %rdi
        movl    %edi, %ecx
        orq     $(VEC_SIZE * 4 - 1), %rdi
        andl    $(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get the wchar_t count.  */
        sarl    $2, %ecx
#  endif
        addq    %rcx, %rdx
# else
        /* Align data to VEC_SIZE * 4 - 1 for loop.  */
        incq    %rdi
        orq     $(VEC_SIZE * 4 - 1), %rdi
# endif

        /* Compare 4 * VEC at a time forward.  */
        .p2align 4
L(loop_4x_vec):
        VPCMPEQ 1(%rdi), %ymm0, %ymm1
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
        vpor    %ymm1, %ymm2, %ymm5
        vpor    %ymm3, %ymm4, %ymm6
        vpor    %ymm5, %ymm6, %ymm5
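        /* ymm5 is now the OR of all four compare results, so a single
           vpmovmskb/test below detects a match anywhere in the 4x VEC
           block; identifying which vector matched is deferred to
           L(loop_4x_vec_end).  */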

        vpmovmskb %ymm5, %ecx
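        /* NB: below, subtracting -(VEC_SIZE * 4) is used instead of
           adding VEC_SIZE * 4 because -128 fits in a sign-extended
           8-bit immediate while +128 does not, giving a shorter
           encoding.  */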
# ifdef USE_AS_RAWMEMCHR
        subq    $-(VEC_SIZE * 4), %rdi
        testl   %ecx, %ecx
        jz      L(loop_4x_vec)
# else
        testl   %ecx, %ecx
        jnz     L(loop_4x_vec_end)

        subq    $-(VEC_SIZE * 4), %rdi

        subq    $(CHAR_PER_VEC * 4), %rdx
        ja      L(loop_4x_vec)

        /* Fall through into less than 4 remaining vectors of length
           case.  */
        VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        .p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
        /* Check if first VEC contained match.  */
        testl   %eax, %eax
        jnz     L(first_vec_x1_check)

        /* If remaining length > VEC_SIZE * 2.  */
        addl    $(VEC_SIZE * 2), %edx
        jg      L(last_4x_vec)

L(last_2x_vec):
        /* If remaining length < VEC_SIZE.  */
        addl    $VEC_SIZE, %edx
        jle     L(zero_end)

        /* Check VEC2 and compare any match with remaining length.  */
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        tzcntl  %eax, %eax
        cmpl    %eax, %edx
        jbe     L(set_zero_end)
        addq    $(VEC_SIZE + 1), %rdi
        addq    %rdi, %rax
L(zero_end):
        VZEROUPPER_RETURN

        .p2align 4
L(loop_4x_vec_end):
# endif
        /* rawmemchr will fall through into this if match was found in
           loop.  */

        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x1_return)

        vpmovmskb %ymm2, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x2_return)

        vpmovmskb %ymm3, %eax
        /* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
        salq    $32, %rcx
        orq     %rcx, %rax
        tzcntq  %rax, %rax
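        /* NB: rcx held the vpmovmskb of the OR of all four vectors, but
           VEC1 and VEC2 were already tested zero above, so its value
           here equals mask(VEC3) | mask(VEC4).  With mask(VEC3) in the
           low 32 bits, the 64-bit tzcnt finds the first match in VEC3
           or VEC4.  */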
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 2 - 1), %rdi
# else
        subq    $-(VEC_SIZE * 2 + 1), %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

        .p2align 4
L(first_vec_x1_check):
        tzcntl  %eax, %eax
        /* Adjust length.  */
        subl    $-(VEC_SIZE * 4), %edx
        /* Check if match within remaining length.  */
        cmpl    %eax, %edx
        jbe     L(set_zero_end)
        incq    %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN
        .p2align 4
L(set_zero_end):
        xorl    %eax, %eax
        VZEROUPPER_RETURN
# endif

        .p2align 4
L(last_vec_x1_return):
        tzcntl  %eax, %eax
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 4 - 1), %rdi
# else
        incq    %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(last_vec_x2_return):
        tzcntl  %eax, %eax
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 3 - 1), %rdi
# else
        subq    $-(VEC_SIZE + 1), %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
        .p2align 4
L(last_4x_vec_or_less_cmpeq):
        VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
        subq    $-(VEC_SIZE * 4), %rdi
        /* Check first VEC regardless.  */
        testl   %eax, %eax
        jnz     L(first_vec_x1_check)

        /* If remaining length <= CHAR_PER_VEC * 2.  */
        addl    $(VEC_SIZE * 2), %edx
        jle     L(last_2x_vec)
        .p2align 4
L(last_4x_vec):
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x2_return)

        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax

        /* Create mask for possible matches within remaining length.  */
        movq    $-1, %rcx
        bzhiq   %rdx, %rcx, %rcx
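        /* NB: bzhi clears all bits of rcx at positions >= rdx, leaving
           a mask of the low rdx bits, so matches at or beyond the
           remaining length are discarded by the andl below.  The upper
           32 bits of the mask are reused for the last vector after the
           shrq further down.  */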

        /* Keep only the matches that fall within the remaining
           length.  */
        andl    %ecx, %eax
        jnz     L(last_vec_x3)

        /* If remaining length <= VEC_SIZE * 3 (note this is after
           remaining length was found to be > VEC_SIZE * 2).  */
        subl    $VEC_SIZE, %edx
        jbe     L(zero_end2)

        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        /* Shift remaining length mask for last VEC.  */
        shrq    $32, %rcx
        andl    %ecx, %eax
        jz      L(zero_end2)
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 3 + 1), %rdi
        addq    %rdi, %rax
L(zero_end2):
        VZEROUPPER_RETURN

        .p2align 4
L(last_vec_x3):
        tzcntl  %eax, %eax
        subq    $-(VEC_SIZE * 2 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN
# endif

END (MEMCHR)
#endif