/* memchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no separate V2
   implementation, so this file must also be built for ISA V2
   builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMCHR
#  define MEMCHR	__memchr_sse2
# endif
# ifdef USE_AS_WMEMCHR
#  define PCMPEQ	pcmpeqd
#  define CHAR_PER_VEC	4
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_PER_VEC	16
# endif

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
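	/* Arguments (SysV ABI): rdi = s, esi = c, rdx = n (counted in
	   chars for wmemchr).  Keep a copy of the low pointer bits in
	   ecx for the alignment checks below.  */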
	movd	%esi, %xmm1
	mov	%edi, %ecx

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
# else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
# endif

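	/* ecx = byte offset of s within its 64-byte cache line.  For
	   memchr the two punpcklbw above replicated the byte through
	   the low dword; pshufd $0 then broadcasts the char to all
	   four dwords of xmm1.  */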
	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

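	/* If the 16-byte load at s would extend past its 64-byte
	   cache line, take the aligned path: a load that stays within
	   one line cannot cross a page, so the movdqu below is safe
	   even when n is small.  */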
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
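	/* No match in the first vector.  If n is at most CHAR_PER_VEC
	   we are done.  Otherwise round rdi up to a 16-byte boundary;
	   the next vector re-covers the last (s & 15) bytes already
	   checked, so add that overlap (converted to chars for
	   wmemchr) back to rdx.  */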
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
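	/* After the sar above, eax is the byte offset of the match
	   from s.  Convert it to a char offset for wmemchr before
	   checking it against the remaining char count in rdx.  */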
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
112 | /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using |
113 | "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void |
114 | possible addition overflow. */ |
	neg	%rcx
	add	$16, %rcx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

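	/* Scan the next 64 bytes one vector at a time, then align rdi
	   to 64 bytes for the main loop.  */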
	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

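	/* If rdi is now 64-byte aligned, enter the main loop
	   directly; otherwise scan one more 64-byte block, then force
	   the alignment.  */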
	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

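	/* Align rdi down to 64 bytes for the main loop.  The chars
	   between the aligned base and the current position were
	   already checked and will be re-scanned, so add them back to
	   rdx (converted to chars for wmemchr).  */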
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx

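	/* Main loop: scan 64 bytes (CHAR_PER_VEC * 4 chars) per
	   iteration from a 64-byte-aligned rdi while more than that
	   many chars remain.  */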
	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

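	/* Fold the four compare masks into xmm4 with byte maxima: a
	   lane of xmm4 is nonzero iff the same lane matched in xmm0,
	   xmm2, xmm3 or xmm4.  This works for wmemchr too because
	   PCMPEQ lanes are all-zeros or all-ones.  */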
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

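	/* There is a match in the 64 bytes just scanned.  Rewind rdi
	   and find the first matching vector.  */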
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

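	/* The pmaxub folding clobbered xmm3 and xmm4, so redo the
	   compares for the last two vectors.  xmm1 may be overwritten
	   now because the pattern is no longer needed.  */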
	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

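	/* Tail: at most CHAR_PER_VEC * 4 chars remain and rdx holds
	   the remaining count minus CHAR_PER_VEC * 4.  Add back two
	   vectors' worth; if the result is still non-positive, at
	   most CHAR_PER_VEC * 2 chars are left.  */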
	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

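	/* At most CHAR_PER_VEC * 2 chars remain.  Adding back the
	   other two vectors' worth makes edx the exact remaining
	   count.  */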
	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

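	/* Return paths.  L(matches), L(matches16) and L(matches32)
	   compute the address of a match found at the given byte
	   offset from rdi; L(matches0) is reached after rdi was
	   advanced by 64, so its vector starts at rdi - 16.  The
	   L(matches*_1) variants also check that the match lies
	   within the first n chars and return NULL if it does not.  */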
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)
#endif