/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif
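/* NB: For wmemchr the compares operate on 32-bit wchar_t lanes, but
   vpmovmskb below still produces a byte mask, so offsets returned by
   tzcnt are byte offsets in both configurations.  */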

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif
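/* In the page-cross path the original pointer must survive in
   RRAW_PTR_REG (memchr additionally uses it to adjust the length)
   while ALGN_PTR_REG gets aligned.  rawmemchr aligns rdi in place
   because the code it rejoins expects the aligned pointer in rdi;
   memchr keeps the original pointer in rdi and aligns a copy in
   rcx.  */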

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
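/* Number of CHARs in one vector: 32 for memchr/rawmemchr, 8 for
   wmemchr.  The length argument (rdx) is counted in CHARs.  */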
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	and	%RDX_LP, %RDX_LP
#  else
	test	%RDX_LP, %RDX_LP
#  endif
	jz	L(null)
# endif
	/* Broadcast CHAR to ymm0 (the match vector).  */
	vmovd	%esi, %xmm0
	VPBROADCAST %xmm0, %ymm0
	/* Check if the first unaligned VEC_SIZE load may cross a page
	   boundary (i.e. the pointer's page offset is greater than
	   PAGE_SIZE - VEC_SIZE).  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 5
L(first_vec_x0):
	/* Check if first match was before length.  */
	tzcntl	%eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
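	/* Branchless NULL return: rcx is zeroed and cmovle below selects
	   it whenever the length (edx, in bytes) is not above the match
	   offset (eax).  */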
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax), %rax
	cmovle	%rcx, %rax
	VZEROUPPER_RETURN

L(null):
	xorl	%eax, %eax
	ret
# endif
	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning: its original value is needed
	   to compute the return address if the byte is found, or to
	   adjust the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* "Align" to VEC_SIZE - 1 by ORing in the low bits so the pointer
	   ends up at the last byte of its VEC; the aligned load at
	   -(VEC_SIZE - 1) below then covers the original pointer without
	   crossing the page.  ALGN_PTR_REG is rcx for memchr and rdi for
	   rawmemchr.  */
	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Calculate the number of chars from the original pointer to the
	   end of the aligned VEC (the chars already checked for a
	   match).  */
	leaq	1(%ALGN_PTR_REG), %rsi
	subq	%RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %esi
#  endif
# endif
	/* Remove matches for the leading bytes before the original
	   pointer.  NB: only the low 5 bits of the shift count (the byte
	   offset within the VEC) are used by sarx.  */
	sarxl	%ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	addq	%RRAW_PTR_REG, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
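	/* In the first_vec_xN blocks below rdi still points at the last
	   byte of the VEC containing the original pointer (it was
	   "aligned" with orq $(VEC_SIZE - 1)), so the match address is
	   rdi + 1 + (N - 1) * VEC_SIZE + the bit offset in eax.  */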

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	orq	$(VEC_SIZE - 1), %rdi
	/* esi = number of chars from the original pointer through the end
	   of the fourth aligned VEC below; used to adjust the length and
	   detect the near-end case.  */
	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if within the last CHAR_PER_VEC * 4 of the length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
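	/* The loop restarts at the VEC_SIZE * 4 aligned block containing
	   the next unchecked byte and so may rescan up to VEC_SIZE * 3
	   already-checked bytes; ecx holds that overlap (in chars), so
	   add it back to keep rdx consistent.  */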
	addq	%rcx, %rdx
# else
	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
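	/* rdi now points at the last byte of a VEC_SIZE * 4 aligned
	   block, so every load at 1(%rdi) ... (VEC_SIZE * 3 + 1)(%rdi) in
	   the loop below is VEC_SIZE aligned.  */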

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %ecx
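	/* NB: subq $-(VEC_SIZE * 4) is used instead of
	   addq $(VEC_SIZE * 4) because -128 fits in an imm8 while +128
	   needs an imm32, giving a shorter encoding.  */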
# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
# else
	testl	%ecx, %ecx
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case where fewer than 4 vectors of
	   length remain.  */
	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < VEC_SIZE.  */
	addl	$VEC_SIZE, %edx
	jle	L(zero_end)
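	/* NB: eax is known to be zero here (the VEC checked before
	   branching here had no match), so L(zero_end) returns NULL.  */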

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
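	/* If there is no match tzcnt sets eax to VEC_SIZE, which is never
	   below the remaining length here, so the jbe below covers both
	   the no-match and out-of-range-match cases.  */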
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */
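	/* ecx still holds the combined match mask of all four VECs from
	   the loop.  VEC1 and VEC2 are tested individually below; once
	   they are known to be empty, any set bit in ecx can only come
	   from VEC3 or VEC4, so ecx is shifted into the upper half of a
	   64-bit word, OR'd with VEC3's mask, and a single tzcnt locates
	   the first match in either VEC.  */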

	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_return)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	vpmovmskb %ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
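	/* For rawmemchr rdi was already advanced by VEC_SIZE * 4 in the
	   loop, so back it up; for memchr it was not, so move it forward.
	   Either way rdi ends up at the start of VEC3 and rax is the
	   offset from there (VEC4 matches land at offsets >= 32).  */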
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 2 - 1), %rdi
# else
	subq	$-(VEC_SIZE * 2 + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(VEC_SIZE * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4
L(set_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4 - 1), %rdi
# else
	incq	%rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 3 - 1), %rdi
# else
	subq	$-(VEC_SIZE + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Create mask for possible matches within remaining length.  */
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
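	/* NB: rdx is the number of valid bytes starting at the VEC just
	   compared; bzhi clears every bit at position rdx and above.  The
	   low 32 bits cover this VEC, the high 32 bits (shifted down
	   below) cover the following VEC.  */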

	/* Keep only matches that fall within the remaining length.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (note this is after
	   remaining length was already found to be > VEC_SIZE * 2).  */
	subl	$VEC_SIZE, %edx
	jbe	L(zero_end2)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift remaining length mask for last VEC.  */
	shrq	$32, %rcx
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subq	$-(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# endif

END (MEMCHR)
#endif
