/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

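/* memchr/wmemchr/rawmemchr implemented with AVX2.

   Inputs (SysV AMD64 ABI): RDI = buffer, ESI = character to search
   for, RDX = length (unused for rawmemchr; counted in wchar_t for
   wmemchr and converted to bytes below).

   Strategy:
   1. Broadcast the character into YMM0.
   2. Check the first VEC_SIZE bytes with one unaligned load, taking a
      separate path if that load could cross a page boundary.
   3. Align the pointer and compare 4 * VEC_SIZE bytes per loop
      iteration, OR-ing the four compare results so a single
      VPMOVMSKB/TEST detects a match in any of them.
   4. On a match, VPMOVMSKB + TZCNT yield the byte offset of the first
      matching character; where the match might lie beyond the end of
      the buffer, that offset is first checked against the remaining
      length.

   Rough scalar equivalent of the memchr case, for illustration only
   (not part of the build):

     #include <stddef.h>

     void *
     memchr (const void *s, int c, size_t n)
     {
       const unsigned char *p = s;
       for (size_t i = 0; i < n; i++)
	 if (p[i] == (unsigned char) c)
	   return (void *) (p + i);
       return NULL;
     }
 */
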
	.section .text.avx,"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	testq	%rdx, %rdx
	jz	L(null)
# endif
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
# ifdef USE_AS_WMEMCHR
	shl	$2, %rdx
	vpbroadcastd %xmm0, %ymm0
# else
	vpbroadcastb %xmm0, %ymm0
# endif
	/* Check if we may cross page boundary with one vector load.
	   If the pointer's offset within an aligned 2 * VEC_SIZE block
	   is greater than VEC_SIZE, an unaligned VEC_SIZE load from it
	   may extend into the next page.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rdx
	jbe	L(zero)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
	addq	%rcx, %rdx

	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
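	/* The first VEC_SIZE-byte load could cross a page boundary if
	   done unaligned.  Align RDI down to VEC_SIZE so the load cannot
	   fault, then shift the match mask right by the misalignment
	   (in ECX) so that bit 0 corresponds to the original pointer.  */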
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
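	/* Reached from the page-cross path: RDI has been aligned down to
	   VEC_SIZE, ECX holds the misalignment, and the bytes between the
	   original pointer and RDI + VEC_SIZE have already been checked.  */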
# ifndef USE_AS_RAWMEMCHR
	/* Calculate "rdx + rcx - VEC_SIZE" as "rdx - (VEC_SIZE - rcx)"
	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
	   overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rdx
	jbe	L(zero)
# endif

	addq	$VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi
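	/* The realignment may step RDI back by up to 4 * VEC_SIZE - 1
	   bytes, all of which were just checked above, so rescanning
	   them in the loop is harmless.  ECX holds the number of bytes
	   stepped back.  */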

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
	addq	%rcx, %rdx
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4

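	/* OR the four compare results together so a single
	   VPMOVMSKB/TEST detects a match in any of the four vectors;
	   L(4x_vec_end) then rechecks YMM1-YMM4 individually to find
	   which vector holds the first match.  */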
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_RAWMEMCHR
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
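	/* Here RDX holds "remaining length - 4 * VEC_SIZE", which is
	   non-positive.  Bias it by 2 * VEC_SIZE so a single signed
	   comparison tells whether more than 2 * VEC_SIZE bytes remain.  */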
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)

	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2_check)

	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3_check)

	xorl	%eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
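	/* At most 2 * VEC_SIZE bytes remain.  EDX holds "remaining
	   length - 2 * VEC_SIZE"; add 2 * VEC_SIZE back to recover the
	   true remaining length.  */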
	addl	$(VEC_SIZE * 2), %edx
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0_check)

	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	xorl	%eax, %eax
	VZEROUPPER
	ret

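	/* The *_check variants below are used when the match might lie
	   beyond the end of the buffer: they compare the match offset
	   within the vector against the length remaining from that
	   vector's start and return NULL if the match falls past the
	   end.  */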
	.p2align 4
L(first_vec_x0_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

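	/* L(zero) is reached after vector compares, so the upper YMM
	   state must be cleared; L(null) is the zero-length shortcut
	   taken before any YMM register is touched, so VZEROUPPER is
	   not needed there.  */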
	.p2align 4
L(zero):
	VZEROUPPER
L(null):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x0):
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
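	/* A match was found somewhere in the last 4 vectors compared;
	   retest the per-vector masks still held in YMM1-YMM4 to locate
	   the first vector containing it.  Falling through all three
	   branches lands in L(first_vec_x3), since the OR of the four
	   results was non-zero.  */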
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

END (MEMCHR)
#endif