1 | /* Copyright (C) 2011-2021 Free Software Foundation, Inc. |
2 | Contributed by Intel Corporation. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
/* Pick the exported function name and the element-wise equality
   instruction: 32-bit lanes (pcmpeqd) when building wmemchr, byte
   lanes (pcmpeqb) when building memchr.  */
21 | #ifdef USE_AS_WMEMCHR |
22 | # define MEMCHR wmemchr |
23 | # define PCMPEQ pcmpeqd |
24 | #else |
25 | # define MEMCHR memchr |
26 | # define PCMPEQ pcmpeqb |
27 | #endif |
28 | |
29 | /* Fast SSE2 version using pmaxub and a 64-byte loop. */ |
30 | |
31 | .text |
/* MEMCHR (S, C, N): return the address of the first element equal to C
   among the first N elements starting at S, or 0 if there is none.
   Elements are bytes for memchr and 4-byte units for wmemchr.

   In:   rdi = S, esi = C, rdx = N (SysV AMD64 argument registers).
   Out:  rax = address of the match, or 0.

   Register roles set up here and relied on by the code that follows:
     xmm1 = C replicated into every lane,
     rdx  = length bookkeeping, always in bytes,
     rdi  = current read position (16-byte aligned after the first
	    vector),
     ecx  = low bits of S, used for the alignment arithmetic.  */
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	/* Only the low address bits are needed for the alignment tests.  */
	mov	%edi, %ecx

#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	/* Turn the element count into a byte count.  N * 4 wraps when
	   either of the two top bits of N is set; the wrapped value made
	   the search window too small (for example zero bytes) and real
	   matches were reported as missing.  Saturate to the largest
	   multiple of 4 instead.  Counts that do not wrap are converted
	   exactly as before.  */
	mov	%RDX_LP, %RAX_LP
	shl	$2, %RAX_LP
	shr	$2, %RAX_LP		/* N with its two top bits cleared.  */
	cmp	%RAX_LP, %RDX_LP	/* Equal iff N * 4 does not wrap.  */
	/* mov and lea do not modify the flags consumed by cmovne.  */
	mov	$-4, %RAX_LP
	lea	(, %rdx, 4), %RDX_LP
	cmovne	%RAX_LP, %RDX_LP
#else
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Two byte unpacks replicate C into the low four bytes of xmm1;
	   the zero-length test is interleaved between them.  */
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif

	/* ecx = S mod 64.  */
	and	$63, %ecx
	/* Replicate the low dword of xmm1 into all four dwords.  */
	pshufd	$0, %xmm1, %xmm1

	/* With S mod 64 > 48 an unaligned 16-byte load at S would extend
	   past the next 64-byte boundary; handle that with an aligned
	   load instead.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	/* Examine the first 16 bytes with an unaligned load.  */
	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	/* L(matches_1) still verifies that the match lies inside the
	   length in rdx.  */
	jnz	L(matches_1)
	sub	$16, %rdx
	jbe	L(return_null)
	/* Continue at the next 16-byte boundary after S.  That boundary
	   is (S mod 16) bytes before S + 16, so that many bytes are
	   added back to the remaining length.  */
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
	add	%rcx, %rdx
	/* Keep rdx biased by -64: the four-vector prolog runs only when
	   more than 64 bytes remain.  */
	sub	$64, %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)
72 | |
/* Reached when S mod 64 > 48.  The 16-byte block containing S is read
   with an aligned load and the mask bits belonging to bytes before S
   are shifted out.  On entry ecx = S mod 64 and rdx = length in
   bytes.  */
73 | .p2align 4 |
74 | L(crosscache): |
	/* ecx = S mod 16, rdi = S rounded down to 16 bytes.  */
75 | and $15, %ecx |
76 | and $-16, %rdi |
77 | movdqa (%rdi), %xmm0 |
78 | |
79 | PCMPEQ %xmm1, %xmm0 |
80 | /* Check if there is a match. */ |
81 | pmovmskb %xmm0, %eax |
82 | /* Remove the leading bytes. */ |
83 | sar %cl, %eax |
84 | test %eax, %eax |
85 | je L(unaligned_no_match) |
86 | /* Check which byte is a match. */ |
87 | bsf %eax, %eax |
88 | |
	/* After the shift eax is an offset from S; it counts only when it
	   is smaller than the length.  */
89 | sub %rax, %rdx |
90 | jbe L(return_null) |
	/* Result = aligned base + (S mod 16) + offset.  */
91 | add %rdi, %rax |
92 | add %rcx, %rax |
93 | ret |
94 | |
/* No match in the aligned block that contains S.  Account for the
   16 - rcx bytes examined from S onward, step to the next 16-byte
   block and fall through into L(loop_prolog) when more than 64 bytes
   remain.  */
95 | .p2align 4 |
96 | L(unaligned_no_match): |
97 | /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using |
98 | "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid |
99 | possible addition overflow. */ |
100 | neg %rcx |
101 | add $16, %rcx |
102 | sub %rcx, %rdx |
103 | jbe L(return_null) |
104 | add $16, %rdi |
	/* Bias rdx by -64, as L(loop_prolog) and L(exit_loop) expect.  */
105 | sub $64, %rdx |
106 | jbe L(exit_loop) |
107 | |
/* Entered with rdi 16-byte aligned and rdx = (bytes left from rdi) - 64,
   which is greater than zero, so four whole vectors can be examined
   without any length check.  Up to two such 64-byte rounds are done
   here before rdi is rounded down to a 64-byte boundary for
   L(align64_loop).  */
108 | .p2align 4 |
109 | L(loop_prolog): |
110 | movdqa (%rdi), %xmm0 |
111 | PCMPEQ %xmm1, %xmm0 |
112 | pmovmskb %xmm0, %eax |
113 | test %eax, %eax |
114 | jnz L(matches) |
115 | |
116 | movdqa 16(%rdi), %xmm2 |
117 | PCMPEQ %xmm1, %xmm2 |
118 | pmovmskb %xmm2, %eax |
119 | test %eax, %eax |
120 | jnz L(matches16) |
121 | |
122 | movdqa 32(%rdi), %xmm3 |
123 | PCMPEQ %xmm1, %xmm3 |
124 | pmovmskb %xmm3, %eax |
125 | test %eax, %eax |
126 | jnz L(matches32) |
127 | |
	/* rdi is advanced before the fourth mask is tested, which is why
	   L(matches0) addresses the vector at rdi - 16.  */
128 | movdqa 48(%rdi), %xmm4 |
129 | PCMPEQ %xmm1, %xmm4 |
130 | add $64, %rdi |
131 | pmovmskb %xmm4, %eax |
132 | test %eax, %eax |
133 | jnz L(matches0) |
134 | |
	/* If rdi is already 64-byte aligned, go straight to the main loop;
	   rdx now equals the bytes left from the advanced rdi.  */
135 | test $0x3f, %rdi |
136 | jz L(align64_loop) |
137 | |
	/* Second round: again requires more than 64 bytes to be left.  */
138 | sub $64, %rdx |
139 | jbe L(exit_loop) |
140 | |
141 | movdqa (%rdi), %xmm0 |
142 | PCMPEQ %xmm1, %xmm0 |
143 | pmovmskb %xmm0, %eax |
144 | test %eax, %eax |
145 | jnz L(matches) |
146 | |
147 | movdqa 16(%rdi), %xmm2 |
148 | PCMPEQ %xmm1, %xmm2 |
149 | pmovmskb %xmm2, %eax |
150 | test %eax, %eax |
151 | jnz L(matches16) |
152 | |
153 | movdqa 32(%rdi), %xmm3 |
154 | PCMPEQ %xmm1, %xmm3 |
155 | pmovmskb %xmm3, %eax |
156 | test %eax, %eax |
157 | jnz L(matches32) |
158 | |
159 | movdqa 48(%rdi), %xmm3 |
160 | PCMPEQ %xmm1, %xmm3 |
161 | pmovmskb %xmm3, %eax |
162 | |
163 | add $64, %rdi |
164 | test %eax, %eax |
165 | jnz L(matches0) |
166 | |
	/* Round rdi down to a 64-byte boundary and add the bytes stepped
	   back over (rdi mod 64) to rdx.  Those bytes were examined above
	   without a match, so scanning them again cannot produce a false
	   hit.  */
167 | mov %rdi, %rcx |
168 | and $-64, %rdi |
169 | and $63, %ecx |
170 | add %rcx, %rdx |
171 | |
/* Main loop.  rdi is 64-byte aligned and rdx holds the bytes left from
   rdi.  Each iteration compares four vectors (64 bytes) against xmm1
   and merges the four results with pmaxub so that a single pmovmskb
   tells whether any of them contains a match.  The loop is left for
   L(exit_loop) as soon as 64 bytes or fewer remain.  */
172 | .p2align 4 |
173 | L(align64_loop): |
174 | sub $64, %rdx |
175 | jbe L(exit_loop) |
176 | movdqa (%rdi), %xmm0 |
177 | movdqa 16(%rdi), %xmm2 |
178 | movdqa 32(%rdi), %xmm3 |
179 | movdqa 48(%rdi), %xmm4 |
180 | |
181 | PCMPEQ %xmm1, %xmm0 |
182 | PCMPEQ %xmm1, %xmm2 |
183 | PCMPEQ %xmm1, %xmm3 |
184 | PCMPEQ %xmm1, %xmm4 |
185 | |
	/* xmm4 = byte-wise maximum of all four compare results; xmm3 and
	   xmm4 are overwritten, xmm0 and xmm2 are preserved.  */
186 | pmaxub %xmm0, %xmm3 |
187 | pmaxub %xmm2, %xmm4 |
188 | pmaxub %xmm3, %xmm4 |
189 | pmovmskb %xmm4, %eax |
190 | |
191 | add $64, %rdi |
192 | |
193 | test %eax, %eax |
194 | jz L(align64_loop) |
195 | |
	/* A match lies somewhere in the 64 bytes just examined: step rdi
	   back and find out which vector holds it.  */
196 | sub $64, %rdi |
197 | |
198 | pmovmskb %xmm0, %eax |
199 | test %eax, %eax |
200 | jnz L(matches) |
201 | |
202 | pmovmskb %xmm2, %eax |
203 | test %eax, %eax |
204 | jnz L(matches16) |
205 | |
	/* The individual results for the third and fourth vectors were
	   lost to pmaxub, so compare them again.  The fourth compare
	   overwrites xmm1, which is not needed past this point.  */
206 | movdqa 32(%rdi), %xmm3 |
207 | PCMPEQ %xmm1, %xmm3 |
208 | |
209 | PCMPEQ 48(%rdi), %xmm1 |
210 | pmovmskb %xmm3, %eax |
211 | test %eax, %eax |
212 | jnz L(matches32) |
213 | |
	/* The first three vectors had no match, so the fourth one must;
	   its mask is used without a test.  */
214 | pmovmskb %xmm1, %eax |
215 | bsf %eax, %eax |
216 | lea 48(%rdi, %rax), %rax |
217 | ret |
218 | |
/* Tail handling.  Every jump here follows "sub $64, %rdx; jbe", so rdx
   holds (bytes left from rdi) - 64, a value that is zero or negative,
   and at most 64 bytes remain.  From here on only the low 32 bits
   (edx) are used, with signed comparisons.  */
219 | .p2align 4 |
220 | L(exit_loop): |
	/* edx = (bytes left) - 32; 32 bytes or fewer go to the short
	   tail.  */
221 | add $32, %edx |
222 | jle L(exit_loop_32) |
223 | |
	/* More than 32 bytes are left, so the first two vectors lie
	   entirely inside the range and need no length check.  */
224 | movdqa (%rdi), %xmm0 |
225 | PCMPEQ %xmm1, %xmm0 |
226 | pmovmskb %xmm0, %eax |
227 | test %eax, %eax |
228 | jnz L(matches) |
229 | |
230 | movdqa 16(%rdi), %xmm2 |
231 | PCMPEQ %xmm1, %xmm2 |
232 | pmovmskb %xmm2, %eax |
233 | test %eax, %eax |
234 | jnz L(matches16) |
235 | |
	/* The third vector may extend past the end: L(matches32_1) checks
	   the match offset against edx, the bytes left from rdi + 32.  */
236 | movdqa 32(%rdi), %xmm3 |
237 | PCMPEQ %xmm1, %xmm3 |
238 | pmovmskb %xmm3, %eax |
239 | test %eax, %eax |
240 | jnz L(matches32_1) |
	/* edx = bytes left from rdi + 48; nothing more to examine if that
	   is not positive.  */
241 | sub $16, %edx |
242 | jle L(return_null) |
243 | |
	/* Last vector; xmm1 is overwritten because it is no longer
	   needed.  */
244 | PCMPEQ 48(%rdi), %xmm1 |
245 | pmovmskb %xmm1, %eax |
246 | test %eax, %eax |
247 | jnz L(matches48_1) |
248 | xor %eax, %eax |
249 | ret |
250 | |
/* Short tail, entered from L(exit_loop) when at most 32 bytes are left.
   On entry edx = (bytes left from rdi) - 32.  Both candidate vectors
   may extend past the end, so both match paths go through the
   length-checking stubs.  */
251 | .p2align 4 |
252 | L(exit_loop_32): |
	/* Undo the remaining bias: edx = bytes left from rdi.  Writing edx
	   also zero-extends into rdx, which the stubs compare against.  */
253 | add $32, %edx |
254 | movdqa (%rdi), %xmm0 |
255 | PCMPEQ %xmm1, %xmm0 |
256 | pmovmskb %xmm0, %eax |
257 | test %eax, %eax |
258 | jnz L(matches_1) |
	/* edx = bytes left from rdi + 16.  */
259 | sub $16, %edx |
260 | jbe L(return_null) |
261 | |
	/* Last vector; xmm1 is overwritten because it is no longer
	   needed.  */
262 | PCMPEQ 16(%rdi), %xmm1 |
263 | pmovmskb %xmm1, %eax |
264 | test %eax, %eax |
265 | jnz L(matches16_1) |
266 | xor %eax, %eax |
267 | ret |
268 | |
/* eax = non-zero match mask of a vector that was read at offset 48
   before rdi was advanced by 64, i.e. the vector at rdi - 16.  Return
   the address of its first match; no length check is made.  */
269 | .p2align 4 |
270 | L(matches0): |
271 | bsf %eax, %eax |
272 | lea -16(%rax, %rdi), %rax |
273 | ret |
274 | |
/* eax = non-zero match mask of the vector at rdi.  Return the address
   of its first match; no length check is made.  */
275 | .p2align 4 |
276 | L(matches): |
277 | bsf %eax, %eax |
278 | add %rdi, %rax |
279 | ret |
280 | |
/* eax = non-zero match mask of the vector at rdi + 16.  Return the
   address of its first match; no length check is made.  */
281 | .p2align 4 |
282 | L(matches16): |
283 | bsf %eax, %eax |
284 | lea 16(%rax, %rdi), %rax |
285 | ret |
286 | |
/* eax = non-zero match mask of the vector at rdi + 32.  Return the
   address of its first match; no length check is made.  */
287 | .p2align 4 |
288 | L(matches32): |
289 | bsf %eax, %eax |
290 | lea 32(%rax, %rdi), %rax |
291 | ret |
292 | |
/* Length-checked variant of L(matches).  eax = non-zero match mask of
   the vector at rdi, rdx = bytes left counted from rdi.  The match is
   returned only if its offset is smaller than rdx; otherwise the
   result is null.  */
293 | .p2align 4 |
294 | L(matches_1): |
295 | bsf %eax, %eax |
296 | sub %rax, %rdx |
297 | jbe L(return_null) |
298 | add %rdi, %rax |
299 | ret |
300 | |
/* Length-checked variant of L(matches16).  eax = non-zero match mask of
   the vector at rdi + 16, rdx = bytes left counted from rdi + 16.  The
   match is returned only if its offset within that vector is smaller
   than rdx; otherwise the result is null.  */
301 | .p2align 4 |
302 | L(matches16_1): |
303 | bsf %eax, %eax |
304 | sub %rax, %rdx |
305 | jbe L(return_null) |
306 | lea 16(%rdi, %rax), %rax |
307 | ret |
308 | |
/* Length-checked variant of L(matches32).  eax = non-zero match mask of
   the vector at rdi + 32, rdx = bytes left counted from rdi + 32.  The
   match is returned only if its offset within that vector is smaller
   than rdx; otherwise the result is null.  */
309 | .p2align 4 |
310 | L(matches32_1): |
311 | bsf %eax, %eax |
312 | sub %rax, %rdx |
313 | jbe L(return_null) |
314 | lea 32(%rdi, %rax), %rax |
315 | ret |
316 | |
/* Length-checked match in the vector at rdi + 48.  eax = its non-zero
   match mask, rdx = bytes left counted from rdi + 48.  The match is
   returned only if its offset within that vector is smaller than rdx;
   otherwise the result is null.  */
317 | .p2align 4 |
318 | L(matches48_1): |
319 | bsf %eax, %eax |
320 | sub %rax, %rdx |
321 | jbe L(return_null) |
322 | lea 48(%rdi, %rax), %rax |
323 | ret |
324 | |
/* Common "not found" exit: return a null pointer in rax.  Also closes
   the function opened by ENTRY(MEMCHR).  */
325 | .p2align 4 |
326 | L(return_null): |
327 | xor %eax, %eax |
328 | ret |
329 | END(MEMCHR) |
330 | |
/* Emitted for the memchr build only, not for wmemchr.  Both macros are
   defined outside this file; judging by their names and arguments they
   add the __memchr alias and the libc-internal hidden definition of
   memchr -- confirm against the glibc internal headers.  */
331 | #ifndef USE_AS_WMEMCHR |
332 | strong_alias (memchr, __memchr) |
333 | libc_hidden_builtin_def(memchr) |
334 | #endif |
335 | |