/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR __memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ vpcmpeqd
# else
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# define VEC_SIZE 32

	.section .text.avx,"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test %RDX_LP, %RDX_LP
	jz L(null)
# endif
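	/* Save the low bits of the pointer; they are used below for the
	   page-cross check and for re-aligning the data.  */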
	movl %edi, %ecx
	/* Broadcast CHAR to YMM0.  */
	vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
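	/* The length is a count of wide characters; convert it to a byte
	   count.  */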
	shl $2, %RDX_LP
	vpbroadcastd %xmm0, %ymm0
# else
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
#  endif
	vpbroadcastb %xmm0, %ymm0
# endif
	/* Check if we may cross page boundary with one vector load.  */
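	/* The check is conservative: it branches whenever the offset within
	   an aligned 2 * VEC_SIZE block exceeds VEC_SIZE, which covers every
	   possible page cross since a page is a multiple of 2 * VEC_SIZE.  */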
	andl $(2 * VEC_SIZE - 1), %ecx
	cmpl $VEC_SIZE, %ecx
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

# ifndef USE_AS_RAWMEMCHR
	jnz L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq $VEC_SIZE, %rdx
	jbe L(zero)
# else
	jnz L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
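	/* %ecx is reduced to the misalignment within VEC_SIZE; memchr uses
	   it below to readjust the length after %rdi is rounded down.  */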
	addq $VEC_SIZE, %rdi
	andl $(VEC_SIZE - 1), %ecx
	andq $-VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
	addq %rcx, %rdx

	subq $(VEC_SIZE * 4), %rdx
	jbe L(last_4x_vec_or_less)
# endif
	jmp L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl $(VEC_SIZE - 1), %ecx
	andq $-VEC_SIZE, %rdi
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
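	/* The aligned load starts %ecx bytes before the original pointer;
	   shift those bits out so bit 0 of the mask corresponds to the
	   first requested byte.  */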
	sarl %cl, %eax
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq %rax, %rdx
	jbe L(zero)
# endif
	addq %rdi, %rax
	addq %rcx, %rax
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
# ifndef USE_AS_RAWMEMCHR
	/* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
	   overflow.  */
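	/* This subtracts from the length the VEC_SIZE - rcx bytes that the
	   compare above already covered.  */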
	negq %rcx
	addq $VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq %rcx, %rdx
	jbe L(zero)
# endif

	addq $VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	subq $(VEC_SIZE * 4), %rdx
	jbe L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	addq $(VEC_SIZE * 4), %rdi

# ifndef USE_AS_RAWMEMCHR
	subq $(VEC_SIZE * 4), %rdx
	jbe L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq %rdi, %rcx
	andl $(4 * VEC_SIZE - 1), %ecx
	andq $-(4 * VEC_SIZE), %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
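	/* Rounding %rdi down re-exposes %rcx bytes that were already
	   checked; add them back so %rdx keeps counting from %rdi.  */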
	addq %rcx, %rdx
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4

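	/* Merge the four compare results so a single branch can detect a
	   match in any of the four vectors.  */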
	vpor %ymm1, %ymm2, %ymm5
	vpor %ymm3, %ymm4, %ymm6
	vpor %ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(4x_vec_end)

	addq $(VEC_SIZE * 4), %rdi

# ifdef USE_AS_RAWMEMCHR
	jmp L(loop_4x_vec)
# else
	subq $(VEC_SIZE * 4), %rdx
	ja L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
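	/* %rdx + 4 * VEC_SIZE is the number of bytes left to check.  If at
	   most 2 * VEC_SIZE bytes remain, only the last two vectors need
	   to be examined.  */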
	addl $(VEC_SIZE * 2), %edx
	jle L(last_2x_vec)

	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x2_check)
	subl $VEC_SIZE, %edx
	jle L(zero)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x3_check)
	xorl %eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
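	/* Restore %edx to the number of remaining bytes.  */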
	addl $(VEC_SIZE * 2), %edx
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x0_check)
	subl $VEC_SIZE, %edx
	jle L(zero)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1_check)
	xorl %eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x0_check):
	tzcntl %eax, %eax
	/* Check the end of data.  */
	cmpq %rax, %rdx
	jbe L(zero)
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl %eax, %eax
	/* Check the end of data.  */
	cmpq %rax, %rdx
	jbe L(zero)
	addq $VEC_SIZE, %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl %eax, %eax
	/* Check the end of data.  */
	cmpq %rax, %rdx
	jbe L(zero)
	addq $(VEC_SIZE * 2), %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl %eax, %eax
	/* Check the end of data.  */
	cmpq %rax, %rdx
	jbe L(zero)
	addq $(VEC_SIZE * 3), %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(zero):
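	/* The zero-length check at entry jumps to L(null) before any YMM
	   register is used, so only L(zero) needs VZEROUPPER.  */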
	VZEROUPPER
L(null):
	xorl %eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x0):
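	/* tzcnt gives the byte offset of the first match within the
	   vector.  */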
	tzcntl %eax, %eax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
	addq $VEC_SIZE, %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 2), %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
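	/* At least one of the four vectors compared in the loop holds a
	   match; re-test the individual masks to locate the first one.  */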
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)
	vpmovmskb %ymm2, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	testl %eax, %eax
L(first_vec_x3):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 3), %rax
	addq %rdi, %rax
	VZEROUPPER
	ret

END (MEMCHR)
#endif