1 | /* Copyright (C) 2011-2017 Free Software Foundation, Inc. |
2 | Contributed by Intel Corporation. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
22 | |
	.text

/* void *memchr (const void *s, int c, size_t n)

   ABI: System V AMD64.
   In:   rdi = s, rsi = c (only the low byte is used), rdx = n.
   Out:  rax = pointer to the first byte in s[0..n-1] equal to
	 (unsigned char) c, or NULL if there is none.
   Clobbers: rcx, rdx, xmm0-xmm4, flags.

   Strategy: broadcast c to all 16 bytes of xmm1, handle the first
   (possibly unaligned) 16 bytes, then scan 16-byte chunks until the
   pointer is 64-byte aligned, then run a 64-bytes-per-iteration loop
   that merges four pcmpeqb results with pmaxub so a single pmovmskb
   tests all 64 bytes at once.  */
ENTRY(memchr)
	movd	%rsi, %xmm1		/* xmm1 byte 0 = c (low byte of rsi).  */
	mov	%rdi, %rcx		/* rcx = s, for alignment arithmetic.  */

	punpcklbw %xmm1, %xmm1		/* Duplicate c into the low 2 bytes...  */
	test	%rdx, %rdx
	jz	L(return_null)		/* n == 0 -> NULL.  */
	punpcklbw %xmm1, %xmm1		/* ... and now into the low 4 bytes.  */

	and	$63, %rcx		/* rcx = s % 64 (offset in 64-byte block).  */
	pshufd	$0, %xmm1, %xmm1	/* Broadcast c to all 16 bytes of xmm1.  */

	cmp	$48, %rcx		/* If s % 64 > 48, an unaligned 16-byte load
					   would cross the 64-byte boundary (and
					   possibly a page); take the careful path.  */
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0		/* Unaligned load of the first 16 bytes.  */
	pcmpeqb	%xmm1, %xmm0		/* Per-byte compare: 0xff where byte == c.  */
	pmovmskb %xmm0, %eax		/* eax = 16-bit mask of matching bytes.  */
	test	%eax, %eax

	jnz	L(matches_1)		/* Possible match; still must check n.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* n <= 16 and no match within it.  */
	add	$16, %rdi
	and	$15, %rcx		/* rcx = s % 16.  */
	and	$-16, %rdi		/* Round rdi down to a 16-byte boundary;
					   some low bytes will be re-scanned.  */
	add	%rcx, %rdx		/* Compensate n for the re-scanned bytes.  */
	sub	$64, %rdx		/* rdx = remaining - 64 from here on.  */
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* First 16 bytes straddle a 64-byte boundary: use an aligned load
	   of the enclosing 16-byte chunk and mask off bytes before s.  */
	and	$15, %rcx		/* rcx = s % 16 (bytes to ignore).  */
	and	$-16, %rdi		/* Align rdi down to 16 bytes.  */
	movdqa	(%rdi), %xmm0		/* Aligned load, includes bytes before s.  */

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax		/* Shift out mask bits for bytes before s.  */
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax		/* eax = match index relative to s.  */

	sub	%rax, %rdx
	jbe	L(return_null)		/* Match lies at or beyond s + n.  */
	add	%rdi, %rax
	add	%rcx, %rax		/* rax = aligned base + (s%16) + index.  */
	ret

	.p2align 4
L(unaligned_no_match):
	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   rdx = rcx + rdx
	   rdx |= -(rdx < rcx) */
	add	%rcx, %rdx
	sbb	%rax, %rax		/* rax = all-ones iff the add carried.  */
	or	%rax, %rdx		/* Saturate the count on overflow.  */
	sub	$16, %rdx		/* Account for the chunk just examined.  */
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx		/* rdx = remaining - 64 from here on.  */
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* Scan four aligned 16-byte chunks individually; at least 64 bytes
	   remain here (rdx = remaining - 64 > 0).  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi		/* Advance before the branch; matches0
					   compensates with a -16 offset.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)		/* Already 64-byte aligned: enter loop.  */

	sub	$64, %rdx
	jbe	L(exit_loop)

	/* Not yet 64-byte aligned: scan one more 64-byte stretch in
	   16-byte steps, then force alignment below.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi		/* Round rdi down to 64 bytes.  */
	and	$63, %rcx		/* rcx = bytes stepped back...  */
	add	%rcx, %rdx		/* ... re-credited to the count.  */

	.p2align 4
L(align64_loop):
	/* Main loop: rdi is 64-byte aligned; rdx = remaining - 64.
	   The four pcmpeqb results are merged with pmaxub (any 0xff match
	   byte survives the max), so one pmovmskb tests all 64 bytes.  */
	sub	$64, %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3		/* xmm3 = matches in chunk 0 or 2.  */
	pmaxub	%xmm2, %xmm4		/* xmm4 = matches in chunk 1 or 3.  */
	pmaxub	%xmm3, %xmm4		/* xmm4 = matches anywhere in 64 bytes.  */
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi		/* Step back to the block with the match.  */

	/* Locate the 16-byte chunk containing the first match.  xmm0/xmm2
	   still hold the compare results for chunks 0 and 1; chunks 2 and 3
	   are recomputed.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	pcmpeqb	48(%rdi), %xmm1		/* xmm1 is no longer needed as the
					   broadcast pattern past this point.  */
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax		/* Match must be in chunk 3 here.  */
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	/* Tail: rdx = remaining - 64, with -64 < rdx <= 0.  Check up to
	   four more chunks, bounds-checking each against n.  */
	add	$32, %rdx
	jle	L(exit_loop_32)		/* At most 32 bytes remain.  */

	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)		/* Match in chunk 2: verify against n.  */
	sub	$16, %rdx
	jle	L(return_null)		/* Nothing of chunk 3 is within n.  */

	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)		/* Match in chunk 3: verify against n.  */
	xor	%rax, %rax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes remain: rdx = remaining - 32, with
	   -32 < rdx <= 0.  */
	add	$32, %rdx		/* rdx = remaining (0 < rdx <= 32).  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)		/* Match in chunk 0: verify against n.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* Nothing of chunk 1 is within n.  */

	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)		/* Match in chunk 1: verify against n.  */
	xor	%rax, %rax
	ret

	/* Match epilogues.  The plain L(matchesNN) variants are reached only
	   when the match is known to lie within n; the L(matchesNN_1)
	   variants additionally check the match offset against the
	   remaining count in rdx and return NULL if it is out of range.  */

	.p2align 4
L(matches0):
	bsf	%eax, %eax		/* Caller advanced rdi by 64; chunk base
					   is rdi - 16.  */
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)		/* Match offset >= remaining count.  */
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%rax, %rax		/* Not found.  */
	ret
END(memchr)
314 | |
/* Export the internal alias __memchr and a hidden definition so that
   calls from within the library resolve directly (avoiding PLT
   indirection) — standard glibc convention for builtin string
   functions.  */
strong_alias (memchr, __memchr)

libc_hidden_builtin_def(memchr)
318 | |