/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (3)

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
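
/* Overview: a rough C sketch (with AVX2 intrinsics) of the core idiom
   used below.  Illustrative only and not part of the build; the real
   code additionally handles page crossing, lengths below VEC_SIZE,
   alignment, and the rawmemchr/wmemchr variants.

     #include <immintrin.h>
     #include <stddef.h>

     static const char *
     memchr_sketch (const char *s, int c, size_t n)
     {
       __m256i match = _mm256_set1_epi8 ((char) c);
       size_t i = 0;
       for (; i + 32 <= n; i += 32)
	 {
	   __m256i v = _mm256_loadu_si256 ((const __m256i *) (s + i));
	   // VPCMPEQB yields 0xff per matching byte; VPMOVMSKB packs
	   // one bit per byte, so bit k set <=> s[i + k] == c.
	   unsigned int m
	     = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, match));
	   if (m != 0)
	     return s + i + __builtin_ctz (m);  // tzcnt/bsf below.
	 }
       for (; i < n; i++)
	 if (s[i] == (char) c)
	   return s + i;
       return NULL;
     }  */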

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
#  else
	test	%RDX_LP, %RDX_LP
#  endif
	jz	L(null)
# endif
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
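	/* The check above works because an unaligned VEC_SIZE load at
	   rdi stays within one page iff rdi % PAGE_SIZE is at most
	   PAGE_SIZE - VEC_SIZE.  E.g. at page offset 4065 a 32-byte
	   load ends at offset 4096, i.e. on the next, possibly
	   unmapped, page.  */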

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	bsfl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
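	/* ZERO_UPPER_VEC_REGISTERS_RETURN is, on most configurations,
	   vzeroupper + ret: clearing the upper ymm halves on exit
	   avoids AVX-to-SSE transition penalties in legacy-SSE
	   callers.  RTM-safe builds substitute a vzeroall-based
	   return instead.  */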


# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(first_vec_x0):
	/* Check if the first match was within the length.  */
	tzcntl	%eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	COND_VZEROUPPER
	/* Use a branch instead of cmovcc so L(first_vec_x0) fits in one
	   fetch block; a branch here as opposed to cmovcc is not that
	   costly.  Common usage of memchr is to check if the return
	   value is NULL (if the string were known to contain CHAR the
	   user would use rawmemchr).  This branch is therefore highly
	   correlated with the user's branch and can be predicted by
	   most modern branch predictors.  */
	cmpl	%eax, %edx
	jle	L(null)
	addq	%rdi, %rax
	ret
# endif

	.p2align 4,, 10
L(first_vec_x1):
	bsfl	%eax, %eax
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
	/* Fits in aligning bytes here.  */
L(null):
	xorl	%eax, %eax
	ret
# endif
	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a
	   time since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	orq	$(VEC_SIZE - 1), %rdi
	/* esi is used to adjust the length to check if we are near the
	   end.  */
	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
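	/* The or-trick above: rdi |= VEC_SIZE - 1 rounds rdi up to one
	   byte before the next VEC_SIZE boundary, so every load at
	   1(%rdi) below is aligned.  E.g. with VEC_SIZE 32,
	   rdi = 0x1005 becomes 0x101f and 1(%rdi) = 0x1020.  For
	   memchr, esi is the character count from the original pointer
	   to the end of the fourth vector checked below, so the
	   "subq %rsi, %rdx" that follows both adjusts the length and
	   detects the near-end case in one step.  */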
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if we are in the last CHAR_PER_VEC * 4 characters.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	addq	%rcx, %rdx
# else
	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
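	/* A sketch of the memchr branch above (illustrative only):
	   the aligned loop re-scans the last ecx bytes of the region
	   already checked, so the length is re-extended accordingly.

	       rdi += 1;			 // first unchecked char
	       ecx  = rdi & (VEC_SIZE * 4 - 1);	 // re-scanned bytes
	       rdi |= VEC_SIZE * 4 - 1;		 // byte before block end
	       rdx += ecx / CHAR_SIZE;		 // re-extend the length
	   */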

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5
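	/* The two-level vpor tree folds the four compare results into
	   one vector, so a single vpmovmskb + test per iteration
	   decides whether any of the VEC_SIZE * 4 bytes matched; the
	   individual masks in ymm1..ymm4 are only examined after a
	   hit.  */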

	vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
# else
	testl	%ecx, %ecx
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)
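	/* NB: the loop advances rdi with "subq $-(VEC_SIZE * 4)"
	   rather than "addq $(VEC_SIZE * 4)" because -128 fits in a
	   sign-extended imm8 while +128 needs an imm32, saving 3
	   bytes per instruction.  */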

	/* Fall through into the case of fewer than 4 vectors of length
	   remaining.  */
	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length <= VEC_SIZE.  */
	addl	$VEC_SIZE, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr falls through into this code if a match was found
	   in the loop.  */

	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_return)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	vpmovmskb %ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 2 - 1), %rdi
# else
	subq	$-(VEC_SIZE * 2 + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
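	/* Note on the VEC3/VEC4 combine above: ecx holds the mask of
	   the OR of all four vectors, but VEC1 and VEC2 were just
	   tested and found empty, so its set bits can only come from
	   VEC3 or VEC4.  With VEC3's mask in the low 32 bits, tzcntq
	   prefers a VEC3 match; if VEC3 is empty the high half yields
	   VEC4's first match.  The rdi adjustments differ because
	   rawmemchr advances rdi by VEC_SIZE * 4 before the in-loop
	   test while memchr does not.  */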
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(VEC_SIZE * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4,, 6
L(set_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4 - 1), %rdi
# else
	incq	%rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 3 - 1), %rdi
# else
	subq	$-(VEC_SIZE + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= VEC_SIZE * 2 (edx is a byte count
	   here).  */
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Create mask for possible matches within remaining length.  */
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
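	/* bzhi clears the bits of -1 at positions rdx and above (for
	   rdx >= 64 all bits stay set): rcx = (1 << rdx) - 1, with edx
	   counting the remaining bytes from the first byte of the VEC
	   compared above.  Bit i of the low 32 bits covers byte i of
	   that VEC; after the "shrq $32" below, the next 32 bits cover
	   the final VEC.  */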

	/* Keep only the matches that fall within the remaining
	   length.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (note this is after
	   remaining length was found to be > VEC_SIZE * 2).  */
	subl	$VEC_SIZE, %edx
	jbe	L(zero_end2)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift remaining length mask for last VEC.  */
	shrq	$32, %rcx
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subq	$-(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning: its original value is
	   needed to compute the return address if a byte is found, or
	   to adjust the length if it is not (memchr only).  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
	   and rdi for rawmemchr.  */
	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Calculate length until end of page (length checked for a
	   match).  */
	leaq	1(%ALGN_PTR_REG), %rsi
	subq	%RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %esi
#  endif
# endif
	/* Remove the leading bytes.  */
	sarxl	%ERAW_PTR_REG, %eax, %eax
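	/* sarx shifts by the low 5 bits of the raw pointer, i.e. by
	   its misalignment within the vector, so bit 0 of eax now
	   corresponds to the first requested byte.  The arithmetic
	   shift is safe: the sign bit replicates only when byte
	   VEC_SIZE - 1 matched, and those replicated bits land above
	   the genuine match bit, which tzcnt/bsf still finds
	   first.  */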
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	bsfl	%eax, %eax
	addq	%RRAW_PTR_REG, %rax
	VZEROUPPER_RETURN


END (MEMCHR)
#endif