/* Fast SSE2 memrchr: 64-byte loop using pmaxub to merge comparison results.

   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

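/* __memrchr (s, c, n) returns a pointer to the last occurrence of the
   byte C within the first N bytes of S, or NULL if it does not occur.
   A rough scalar C sketch of those semantics (illustration only, not
   the code actually used; memrchr_ref is a made-up name):

     void *
     memrchr_ref (const void *s, int c, size_t n)
     {
       const unsigned char *p = (const unsigned char *) s + n;
       while (n--)
         if (*--p == (unsigned char) c)
           return (void *) p;
       return NULL;
     }

   The SSE2 code below scans backwards from the end of the buffer,
   16 bytes at a time, and 64 bytes per iteration in the aligned
   main loop.  */
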
	.text
ENTRY (__memrchr)
	movd	%esi, %xmm1

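/* %rdx is the byte count; counts of at most 16 are handled on the
   short path at L(length_less16).  */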
	sub	$16, %RDX_LP
	jbe	L(length_less16)

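/* Replicate the search byte into all 16 lanes of %xmm1 (completed by
   the pshufd below).  */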
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	%RDX_LP, %RDI_LP
	pshufd	$0, %xmm1, %xmm1

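/* Check the last 16 bytes of the buffer with an unaligned load.  */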
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

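/* No match in the tail.  Step %rdi back 64 bytes; if it is not 16-byte
   aligned, round it up to a 16-byte boundary and increase %rdx by the
   same amount so the lower search bound is unchanged.  */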
	sub	$64, %rdi
	mov	%edi, %ecx
	and	$15, %ecx
	jz	L(loop_prolog)

	add	$16, %rdi
	add	$16, %rdx
	and	$-16, %rdi
	sub	%rcx, %rdx

	.p2align 4
L(loop_prolog):
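/* Prolog: check up to two 64-byte blocks, 16 bytes at a time and
   highest addresses first, before aligning %rdi to 64 bytes for the
   main loop.  */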
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

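/* Round %rdi up to a 64-byte boundary for the aligned main loop,
   adjusting %rdx by the same amount.  */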
	mov	%edi, %ecx
	and	$63, %ecx
	jz	L(align64_loop)

	add	$64, %rdi
	add	$64, %rdx
	and	$-64, %rdi
	sub	%rcx, %rdx

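/* Main loop: check 64 bytes per iteration.  The four pcmpeqb results
   (0x00 or 0xff per byte) are merged with pmaxub, so a single pmovmskb
   and test decide whether any of the 64 bytes matched.  A rough
   equivalent in SSE2 intrinsics (illustration only; step64_has_match
   is a made-up helper, and p is assumed 64-byte aligned as in the
   loop below):

     #include <emmintrin.h>

     static int
     step64_has_match (const unsigned char *p, __m128i match)
     {
       const __m128i *v = (const __m128i *) p;
       __m128i c0 = _mm_cmpeq_epi8 (_mm_load_si128 (v + 0), match);
       __m128i c1 = _mm_cmpeq_epi8 (_mm_load_si128 (v + 1), match);
       __m128i c2 = _mm_cmpeq_epi8 (_mm_load_si128 (v + 2), match);
       __m128i c3 = _mm_cmpeq_epi8 (_mm_load_si128 (v + 3), match);
       __m128i any = _mm_max_epu8 (_mm_max_epu8 (c0, c2),
                                   _mm_max_epu8 (c1, c3));
       return _mm_movemask_epi8 (any) != 0;
     }  */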
	.p2align 4
L(align64_loop):
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb	%xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

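/* A match lies somewhere in this 64-byte block; find the highest
   16-byte chunk that contains one.  */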
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%rdi), %xmm1

	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	pmovmskb	%xmm1, %eax
	bsr	%eax, %eax

	add	%rdi, %rax
	ret

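/* Between 1 and 64 valid bytes remain at the bottom of the current
   64-byte window; restore that count into %edx.  Check the remaining
   16-byte chunks, discarding matches that fall before the start of the
   buffer.  */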
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%rax, %rdi), %rax
	ret

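/* The matchesN_1 variants are reached from the exit paths above; they
   additionally check that the match does not lie before the start of
   the buffer, returning NULL if it does.  */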
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

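/* Length of at most 16 bytes with a 16-byte aligned pointer: a single
   compare whose result is masked to the valid bytes.  */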
	.p2align 4
L(length_less16_offset0):
	test	%edx, %edx
	jz	L(return_null)

	mov	%dl, %cl
	pcmpeqb	(%rdi), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	pmovmskb	%xmm1, %eax

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	ret

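/* Length of at most 16 bytes, pointer possibly misaligned.  Align the
   load down to 16 bytes and mask off bytes outside the buffer; if the
   range crosses a 16-byte boundary, handle it as two chunks.  */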
	.p2align 4
L(length_less16):
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	$16, %edx

	pshufd	$0, %xmm1, %xmm1

	mov	%edi, %ecx
	and	$15, %ecx
	jz	L(length_less16_offset0)

	mov	%cl, %dh
	mov	%ecx, %esi
	add	%dl, %dh
	and	$-16, %rdi

	sub	$16, %dh
	ja	L(length_less16_part2)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	sar	%cl, %eax
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax
	ret

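/* The short range crosses a 16-byte boundary: check the upper chunk
   first (masked to the valid bytes), then the lower chunk (shifted
   past the bytes that precede the buffer).  */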
	.p2align 4
L(length_less16_part2):
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %eax

	test	%eax, %eax
	jnz	L(length_less16_part2_return)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	mov	%esi, %ecx
	sar	%cl, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax
	ret

	.p2align 4
L(length_less16_part2_return):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)