1 | /* Copyright (C) 2011-2016 Free Software Foundation, Inc. |
2 | Contributed by Intel Corporation. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
22 | |
23 | .text |
/* void *memchr (const void *s, int c, size_t n)

   Return a pointer to the first byte equal to (unsigned char) c within
   the n bytes starting at s, or NULL if there is none.

   Register roles (System V AMD64 argument registers):
     rdi   s on entry; afterwards the 16-byte aligned scan cursor.
     esi   c; only the low byte is used.
     rdx   n on entry; afterwards a count of bytes left to search,
	   biased as noted at each label.  All comparisons against the
	   original n are unsigned, so n may be as large as SIZE_MAX.
     rcx   scratch: misalignment of s / of the cursor.
     xmm1  the search byte replicated into all 16 lanes.
     rax   scratch match mask, then the return value.
   Also clobbers xmm0, xmm2-xmm4 and the flags.  Uses no stack.  */
ENTRY(memchr)
	/* Only the low byte of c matters, so the 32-bit move is enough.  */
	movd	%esi, %xmm1
	/* Only the low 6 bits of the address are examined below.  */
	mov	%edi, %ecx

	/* Broadcast the byte: byte -> word -> dword -> all four dwords.  */
	punpcklbw %xmm1, %xmm1
	test	%rdx, %rdx
	jz	L(return_null)		/* n == 0: nothing to search.  */
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	/* If s is more than 48 bytes into its 64-byte block, a 16-byte load
	   at s would run past the end of that block; use the aligned-load
	   path instead.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	/* Check the first 16 bytes with an unaligned load.  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)		/* Match position still needs a bound check.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* n <= 16 and no match.  */
	/* Round the cursor up to the next 16-byte boundary.  The rcx bytes
	   between that boundary and s + 16 have already been checked and
	   will be checked again, so give them back to the count.  rdx was
	   just reduced by 16 and rcx < 16, so this addition cannot wrap.  */
	add	$16, %rdi
	and	$15, %rcx
	and	$-16, %rdi
	add	%rcx, %rdx
	sub	$64, %rdx		/* rdx = bytes left at rdi, minus 64.  */
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* rcx = s & 15, rdi = s rounded down to 16 bytes.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes, which lie before s.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax

	/* rax is the match offset from s; it must be below n.  */
	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* rcx < 16 is the number of bytes in the aligned block that precede
	   s, so 16 - rcx bytes of the buffer were just checked.  Compute
	   "rdx - (16 - rcx)" rather than "(rdx + rcx) - 16": the addition
	   would wrap when n is within 15 of SIZE_MAX and the search would
	   wrongly end with NULL.  */
	neg	%rcx
	add	$16, %rcx
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx		/* rdx = bytes left at rdi, minus 64.  */
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* Here more than 64 bytes remain at rdi (rdx = left - 64 > 0), so
	   four 16-byte blocks can be checked without bound checks.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* rdx now equals the bytes left at the advanced rdi.  If rdi is
	   already 64-byte aligned, go straight to the main loop.  */
	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$64, %rdx
	jbe	L(exit_loop)

	/* A second unrolled pass of 64 bytes before aligning the cursor.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round the cursor down to 64 bytes and add the bytes stepped back
	   over (rcx <= 63) to the count.  At least 128 was subtracted from
	   rdx above, so this addition cannot wrap.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %rcx
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	/* Main loop: rdi is 64-byte aligned, rdx = bytes left at rdi.  */
	sub	$64, %rdx
	jbe	L(exit_loop)		/* 64 or fewer bytes left: bounded tail.  */
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Merge the four compare results so one mask test covers 64 bytes.
	   This overwrites xmm3 and xmm4; xmm0 and xmm2 stay intact.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies somewhere in the 64 bytes just examined.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* xmm3 was overwritten by pmaxub; redo the third compare.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* xmm1 is no longer needed as the pattern on this path.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Not in the first three blocks, so it is in the fourth.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	/* Tail: rdx = (bytes left at rdi) - 64, a value in [-63, 0], hence
	   the signed branches here.  After the add, rdx = left - 32.  */
	add	$32, %rdx
	jle	L(exit_loop_32)		/* 32 or fewer bytes left.  */

	/* More than 32 bytes left: the first two blocks are fully inside.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The third block may extend past the end: bound-check a match.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$16, %rdx		/* rdx = left - 48.  */
	jle	L(return_null)

	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	/* Undo the remaining bias: rdx = bytes left at rdi, in [1, 32].  */
	add	$32, %rdx
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$16, %rdx
	jbe	L(return_null)

	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	/* Unbounded exits: the matching block is known to lie wholly inside
	   the buffer.  eax holds its non-zero match mask.  */

	.p2align 4
L(matches0):
	/* rdi was already advanced by 64; the match is in the block that
	   started at rdi - 16.  */
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	/* Bounded exits: rdx = bytes left from the start of the matching
	   block.  A match at offset >= rdx lies beyond the buffer.  */

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(memchr)

/* Make __memchr an alias of memchr.  */
strong_alias (memchr, __memchr)

libc_hidden_builtin_def(memchr)
312 | |