1 | /* memchr/wmemchr optimized with AVX2. |
2 | Copyright (C) 2017-2018 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef MEMCHR |
24 | # define MEMCHR __memchr_avx2 |
25 | # endif |
26 | |
27 | # ifdef USE_AS_WMEMCHR |
28 | # define VPCMPEQ vpcmpeqd |
29 | # else |
30 | # define VPCMPEQ vpcmpeqb |
31 | # endif |
32 | |
33 | # ifndef VZEROUPPER |
34 | # define VZEROUPPER vzeroupper |
35 | # endif |
36 | |
37 | # define VEC_SIZE 32 |
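/* Algorithm overview (summarizing the code below):

   1. Broadcast CHAR to every lane of YMM0.
   2. Check the first VEC_SIZE bytes with one unaligned compare; if
      that load could cross a page boundary, use an aligned load and
      shift the match bits for the leading bytes out of the mask.
   3. Align the pointer and scan 4 * VEC_SIZE bytes per iteration,
      ORing the four compare results so a single mask test covers the
      whole block.
   4. For memchr/wmemchr (but not rawmemchr), bound every step by the
      remaining length and return NULL once it is exhausted.

   A C sketch of one vector check (illustrative only, not compiled;
   match_mask is a hypothetical stand-in for VPCMPEQ followed by
   vpmovmskb, returning a 32-bit value whose bit I is set iff P[I]
   equals CHAR; __builtin_ctz corresponds to the tzcntl below):

     static const char *
     check_vec (const char *p)
     {
       unsigned int mask = match_mask (p);
       return mask ? p + __builtin_ctz (mask) : NULL;
     }
*/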
38 | |
.section .text.avx,"ax",@progbits
40 | ENTRY (MEMCHR) |
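/* Input registers (x86-64 SysV ABI): %rdi = S, %esi = CHAR,
   %rdx = N (no length argument for rawmemchr; for wmemchr N counts
   wchar_t elements).  */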
41 | # ifndef USE_AS_RAWMEMCHR |
42 | /* Check for zero length. */ |
43 | testq %rdx, %rdx |
44 | jz L(null) |
45 | # endif |
46 | movl %edi, %ecx |
47 | /* Broadcast CHAR to YMM0. */ |
48 | vmovd %esi, %xmm0 |
49 | # ifdef USE_AS_WMEMCHR |
50 | shl $2, %rdx |
51 | vpbroadcastd %xmm0, %ymm0 |
52 | # else |
53 | vpbroadcastb %xmm0, %ymm0 |
54 | # endif |
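/* For wmemchr, N was scaled from wchar_t elements to bytes above
   (shl $2) and VPCMPEQ compares 4-byte elements; otherwise the
   compares are byte-wise.  */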
/* Check if we may cross a page boundary with one vector load. */
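/* The AND below leaves S mod (2 * VEC_SIZE) in ECX.  If that offset
   exceeds VEC_SIZE, an unaligned VEC_SIZE load from S may run into
   the next page; otherwise it stays within one 64-byte block and
   therefore within one page, since the page size is a multiple of
   64.  */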
56 | andl $(2 * VEC_SIZE - 1), %ecx |
57 | cmpl $VEC_SIZE, %ecx |
ja L(cross_page_boundary)
59 | |
60 | /* Check the first VEC_SIZE bytes. */ |
61 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
62 | vpmovmskb %ymm1, %eax |
63 | testl %eax, %eax |
64 | |
65 | # ifndef USE_AS_RAWMEMCHR |
66 | jnz L(first_vec_x0_check) |
67 | /* Adjust length and check the end of data. */ |
68 | subq $VEC_SIZE, %rdx |
69 | jbe L(zero) |
70 | # else |
71 | jnz L(first_vec_x0) |
72 | # endif |
73 | |
74 | /* Align data for aligned loads in the loop. */ |
75 | addq $VEC_SIZE, %rdi |
76 | andl $(VEC_SIZE - 1), %ecx |
77 | andq $-VEC_SIZE, %rdi |
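/* RDI is now the first VEC_SIZE-aligned address above S.  The last
   RCX (= S mod VEC_SIZE) bytes of the vector just checked will be
   scanned again, so for memchr/wmemchr RCX is added back to the
   length below.  */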
78 | |
79 | # ifndef USE_AS_RAWMEMCHR |
80 | /* Adjust length. */ |
81 | addq %rcx, %rdx |
82 | |
83 | subq $(VEC_SIZE * 4), %rdx |
84 | jbe L(last_4x_vec_or_less) |
85 | # endif |
86 | jmp L(more_4x_vec) |
87 | |
88 | .p2align 4 |
L(cross_page_boundary):
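/* The unaligned load from S might cross a page, so load instead from
   S rounded down to VEC_SIZE (which cannot cross a page) and shift
   the match bits for the bytes before S out of the mask.  The bit
   index found is relative to the rounded-down RDI, hence the result
   RDI + RCX + RAX below.  */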
90 | andl $(VEC_SIZE - 1), %ecx |
91 | andq $-VEC_SIZE, %rdi |
92 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
93 | vpmovmskb %ymm1, %eax |
94 | /* Remove the leading bytes. */ |
95 | sarl %cl, %eax |
96 | testl %eax, %eax |
97 | jz L(aligned_more) |
98 | tzcntl %eax, %eax |
99 | # ifndef USE_AS_RAWMEMCHR |
100 | /* Check the end of data. */ |
101 | cmpq %rax, %rdx |
102 | jbe L(zero) |
103 | # endif |
104 | addq %rdi, %rax |
105 | addq %rcx, %rax |
106 | VZEROUPPER |
107 | ret |
108 | |
109 | .p2align 4 |
110 | L(aligned_more): |
111 | # ifndef USE_AS_RAWMEMCHR |
112 | /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" |
instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
114 | overflow. */ |
115 | negq %rcx |
116 | addq $VEC_SIZE, %rcx |
117 | |
118 | /* Check the end of data. */ |
119 | subq %rcx, %rdx |
120 | jbe L(zero) |
121 | # endif |
122 | |
123 | addq $VEC_SIZE, %rdi |
124 | |
125 | # ifndef USE_AS_RAWMEMCHR |
126 | subq $(VEC_SIZE * 4), %rdx |
127 | jbe L(last_4x_vec_or_less) |
128 | # endif |
129 | |
130 | L(more_4x_vec): |
/* Check the first 4 * VEC_SIZE bytes. Only one VEC_SIZE at a time
   since data is only aligned to VEC_SIZE. */
133 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
134 | vpmovmskb %ymm1, %eax |
135 | testl %eax, %eax |
136 | jnz L(first_vec_x0) |
137 | |
138 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
139 | vpmovmskb %ymm1, %eax |
140 | testl %eax, %eax |
141 | jnz L(first_vec_x1) |
142 | |
143 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
144 | vpmovmskb %ymm1, %eax |
145 | testl %eax, %eax |
146 | jnz L(first_vec_x2) |
147 | |
148 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
149 | vpmovmskb %ymm1, %eax |
150 | testl %eax, %eax |
151 | jnz L(first_vec_x3) |
152 | |
153 | addq $(VEC_SIZE * 4), %rdi |
154 | |
155 | # ifndef USE_AS_RAWMEMCHR |
156 | subq $(VEC_SIZE * 4), %rdx |
157 | jbe L(last_4x_vec_or_less) |
158 | # endif |
159 | |
160 | /* Align data to 4 * VEC_SIZE. */ |
161 | movq %rdi, %rcx |
162 | andl $(4 * VEC_SIZE - 1), %ecx |
163 | andq $-(4 * VEC_SIZE), %rdi |
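/* RDI was rounded down past RCX (= RDI mod (4 * VEC_SIZE)) bytes
   that have already been checked and will be rescanned by the loop;
   for memchr/wmemchr RCX is added back to the length so it still
   counts down to the true end of the data.  */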
164 | |
165 | # ifndef USE_AS_RAWMEMCHR |
166 | /* Adjust length. */ |
167 | addq %rcx, %rdx |
168 | # endif |
169 | |
170 | .p2align 4 |
171 | L(loop_4x_vec): |
172 | /* Compare 4 * VEC at a time forward. */ |
173 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
174 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 |
175 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 |
176 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 |
177 | |
178 | vpor %ymm1, %ymm2, %ymm5 |
179 | vpor %ymm3, %ymm4, %ymm6 |
180 | vpor %ymm5, %ymm6, %ymm5 |
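/* One mask test on the ORed result detects a match anywhere in the
   4 * VEC_SIZE block; L(4x_vec_end) then re-tests YMM1-YMM4
   individually to locate the first matching vector.  */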
181 | |
182 | vpmovmskb %ymm5, %eax |
183 | testl %eax, %eax |
184 | jnz L(4x_vec_end) |
185 | |
186 | addq $(VEC_SIZE * 4), %rdi |
187 | |
188 | # ifdef USE_AS_RAWMEMCHR |
189 | jmp L(loop_4x_vec) |
190 | # else |
191 | subq $(VEC_SIZE * 4), %rdx |
192 | ja L(loop_4x_vec) |
193 | |
194 | L(last_4x_vec_or_less): |
195 | /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
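/* The remaining length is RDX + 4 * VEC_SIZE here.  After the
   addition below, a positive EDX means more than 2 * VEC_SIZE bytes
   remain, so the first two vectors can be checked without an
   end-of-data test.  */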
196 | addl $(VEC_SIZE * 2), %edx |
197 | jle L(last_2x_vec) |
198 | |
199 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
200 | vpmovmskb %ymm1, %eax |
201 | testl %eax, %eax |
202 | jnz L(first_vec_x0) |
203 | |
204 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
205 | vpmovmskb %ymm1, %eax |
206 | testl %eax, %eax |
207 | jnz L(first_vec_x1) |
208 | |
209 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
210 | vpmovmskb %ymm1, %eax |
testl %eax, %eax
jnz L(first_vec_x2_check)
214 | subl $VEC_SIZE, %edx |
215 | jle L(zero) |
216 | |
217 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
218 | vpmovmskb %ymm1, %eax |
testl %eax, %eax
jnz L(first_vec_x3_check)
222 | xorl %eax, %eax |
223 | VZEROUPPER |
224 | ret |
225 | |
226 | .p2align 4 |
227 | L(last_2x_vec): |
228 | addl $(VEC_SIZE * 2), %edx |
229 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
230 | vpmovmskb %ymm1, %eax |
testl %eax, %eax
jnz L(first_vec_x0_check)
234 | subl $VEC_SIZE, %edx |
235 | jle L(zero) |
236 | |
237 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
238 | vpmovmskb %ymm1, %eax |
239 | testl %eax, %eax |
240 | jnz L(first_vec_x1_check) |
241 | xorl %eax, %eax |
242 | VZEROUPPER |
243 | ret |
244 | |
245 | .p2align 4 |
246 | L(first_vec_x0_check): |
247 | tzcntl %eax, %eax |
248 | /* Check the end of data. */ |
249 | cmpq %rax, %rdx |
250 | jbe L(zero) |
251 | addq %rdi, %rax |
252 | VZEROUPPER |
253 | ret |
254 | |
255 | .p2align 4 |
256 | L(first_vec_x1_check): |
257 | tzcntl %eax, %eax |
258 | /* Check the end of data. */ |
259 | cmpq %rax, %rdx |
260 | jbe L(zero) |
261 | addq $VEC_SIZE, %rax |
262 | addq %rdi, %rax |
263 | VZEROUPPER |
264 | ret |
265 | |
266 | .p2align 4 |
267 | L(first_vec_x2_check): |
268 | tzcntl %eax, %eax |
269 | /* Check the end of data. */ |
270 | cmpq %rax, %rdx |
271 | jbe L(zero) |
272 | addq $(VEC_SIZE * 2), %rax |
273 | addq %rdi, %rax |
274 | VZEROUPPER |
275 | ret |
276 | |
277 | .p2align 4 |
278 | L(first_vec_x3_check): |
279 | tzcntl %eax, %eax |
280 | /* Check the end of data. */ |
281 | cmpq %rax, %rdx |
282 | jbe L(zero) |
283 | addq $(VEC_SIZE * 3), %rax |
284 | addq %rdi, %rax |
285 | VZEROUPPER |
286 | ret |
287 | |
288 | .p2align 4 |
289 | L(zero): |
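/* L(zero) is reached after vector instructions have run, so it
   clears the upper YMM state and falls through; L(null), also the
   target of the zero-length check at entry, is reached before any
   YMM register has been written.  */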
290 | VZEROUPPER |
291 | L(null): |
292 | xorl %eax, %eax |
293 | ret |
294 | # endif |
295 | |
296 | .p2align 4 |
297 | L(first_vec_x0): |
298 | tzcntl %eax, %eax |
299 | addq %rdi, %rax |
300 | VZEROUPPER |
301 | ret |
302 | |
303 | .p2align 4 |
304 | L(first_vec_x1): |
305 | tzcntl %eax, %eax |
306 | addq $VEC_SIZE, %rax |
307 | addq %rdi, %rax |
308 | VZEROUPPER |
309 | ret |
310 | |
311 | .p2align 4 |
312 | L(first_vec_x2): |
313 | tzcntl %eax, %eax |
314 | addq $(VEC_SIZE * 2), %rax |
315 | addq %rdi, %rax |
316 | VZEROUPPER |
317 | ret |
318 | |
319 | .p2align 4 |
320 | L(4x_vec_end): |
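/* At least one of the four vectors compared in the loop matched;
   re-test them in order to find the first one.  If YMM1-YMM3 are
   all empty, the match must be in YMM4 and the code falls through
   into L(first_vec_x3).  */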
321 | vpmovmskb %ymm1, %eax |
322 | testl %eax, %eax |
323 | jnz L(first_vec_x0) |
324 | vpmovmskb %ymm2, %eax |
325 | testl %eax, %eax |
326 | jnz L(first_vec_x1) |
327 | vpmovmskb %ymm3, %eax |
328 | testl %eax, %eax |
329 | jnz L(first_vec_x2) |
330 | vpmovmskb %ymm4, %eax |
331 | testl %eax, %eax |
332 | L(first_vec_x3): |
333 | tzcntl %eax, %eax |
334 | addq $(VEC_SIZE * 3), %rax |
335 | addq %rdi, %rax |
336 | VZEROUPPER |
337 | ret |
338 | |
339 | END (MEMCHR) |
340 | #endif |
341 | |