/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
# define CHAR_PER_VEC	4
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
# define CHAR_PER_VEC	16
#endif
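
/* One body builds both memchr and wmemchr: PCMPEQ compares bytes for
   memchr and 32-bit wchar_t elements for wmemchr, and CHAR_PER_VEC is
   the number of characters held in one 16-byte XMM vector.  */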

/* Fast SSE2 version of MEMCHR, using pmaxub and a 64-byte loop.  */

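/* void *MEMCHR (const void *s, int c, size_t n)

   Inputs (SysV AMD64 ABI): %rdi = s, %esi = c, %rdx = n (a count of
   characters, so wchar_t units for wmemchr).  Returns a pointer to the
   first occurrence in %rax, or NULL.

   Reference C equivalent of the byte variant, for illustration only
   (it is not compiled from this file):

	void *
	memchr (const void *s, int c, size_t n)
	{
	  const unsigned char *p = s;
	  for (; n > 0; --n, ++p)
	    if (*p == (unsigned char) c)
	      return (void *) p;
	  return NULL;
	}
 */
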
	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
#else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif
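
/* After the two punpcklbw steps the search byte occupies the low four
   bytes of %xmm1 (wmemchr already has a full dword from the movd), so
   the pshufd below broadcasts the character to the whole vector.  */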

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)
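
	/* The low 6 address bits are at most 48, so the unaligned 16-byte
	   load below stays within one 64-byte block and therefore cannot
	   cross a page boundary: it is safe even when fewer than 16 bytes
	   of the buffer remain.  */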

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	/* No match in the first vector; stop if no more than
	   CHAR_PER_VEC characters were requested.  */
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	/* Advance past the first vector and round %rdi down to a 16-byte
	   boundary; the characters the aligned loads will rescan are
	   credited back to the length in %rdx.  */
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
#ifdef USE_AS_WMEMCHR
	/* Convert the byte offset in %ecx to a character count.  */
	shr	$2, %ecx
#endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

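/* L(loop_prolog) scans up to two 64-byte blocks one vector at a time;
   afterwards %rdi is rounded down to a 64-byte boundary (crediting the
   rescanned characters back to %rdx) so the aligned four-vector loop
   can take over.  */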
	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round %rdi down to a 64-byte boundary and credit the characters
	   that will be rescanned back to the length in %rdx.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx

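/* Main loop: four 16-byte vectors per iteration.  The compare results
   contain only 0x00 or 0xff bytes, so unsigned byte maximum acts as a
   bitwise OR and folds all four masks into %xmm4, letting a single
   pmovmskb/test detect a match anywhere in the 64 bytes.  This also
   works for wmemchr, where pcmpeqd yields 0 or 0xffffffff per dword.

   Intrinsic sketch of one iteration, for illustration only:

	m0 = _mm_cmpeq_epi8 (v0, needle);	... m1, m2, m3 likewise
	acc = _mm_max_epu8 (_mm_max_epu8 (m0, m2), _mm_max_epu8 (m1, m3));
	if (_mm_movemask_epi8 (acc) != 0)
	  ... some vector matched; re-test them individually.  */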
	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

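	/* %xmm3 and %xmm4 were clobbered by the pmaxub folding, so reload
	   and recompare the last two vectors to locate the match.  */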
	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

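/* At most CHAR_PER_VEC * 4 characters remain; %rdx holds the remaining
   length biased by -(CHAR_PER_VEC * 4).  The adds and subs below rebias
   it step by step to decide how many of the final vectors still need
   checking; the L(matches*_1) labels bound-check any match against the
   remaining length.  */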
	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

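/* Convert a pmovmskb mask to a result pointer: bsf finds the byte
   offset of the first match within the vector, and the lea folds in
   the vector's displacement from %rdi.  The L(matches*_1) variants
   also check that the match lies within the remaining length, so a
   match past the last requested character still returns NULL.  */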
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

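/* Export memchr as __memchr and give internal glibc callers a hidden,
   non-interposable definition.  The wmemchr build skips this and sets
   up its own aliases, presumably in the file that includes this one.  */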
#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif