1 | /* strstr with unaligned loads |
2 | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include "../strchr-isa-default-impl.h" |
21 | |
/* char *__strstr_sse2_unaligned (const char *haystack, const char *needle)

   ABI:  System V AMD64.
   In:   %rdi = haystack, %rsi = needle.
   Out:  %rax = pointer to first occurrence of needle in haystack,
	 NULL if not found, haystack itself if needle is empty.

   Strategy: broadcast needle[0] into %xmm1 and needle[1] into %xmm2,
   then scan the haystack 16/64 bytes at a time for positions i where
   haystack[i] == needle[0] AND haystack[i+1] == needle[1] (a 0x00 byte
   in pcmpeqb(chunk, needle0) min pcmpeqb(chunk+1, needle1) marks such a
   pair), OR'd with a zero-byte (end-of-haystack) detector.  Each set
   bit in the resulting pmovmskb bitmask is a candidate start that is
   verified byte-by-byte against the rest of the needle.  The main loop
   carries a work budget (%r11) and falls back to __strstr_generic when
   candidate verification stops making forward progress, bounding the
   worst case.

   Register roles in the steady state:
     %xmm1 = needle[0] broadcast to 16 bytes
     %xmm2 = needle[1] broadcast to 16 bytes
     %xmm7 = all-zero (null-byte comparator) in the main loop
     %r8   = 64-bit candidate bitmask for the current window
     %r9   = haystack start (progress base for the work budget)
     %r11  = verification-work budget accumulator (starts at -512).  */
ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax		/* eax = needle[0].  */
	testb	%al, %al
	je	L(empty)		/* Empty needle -> return haystack.  */
	movzbl	1(%rsi), %edx		/* edx = needle[1].  */
	testb	%dl, %dl
	je	L(strchr)		/* 1-byte needle -> tail-call strchr.  */
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax		/* eax = haystack offset in its 4 KiB page.  */
	punpcklbw	%xmm1, %xmm1
	cmpq	$4031, %rax		/* Within 65 bytes of the page end?  */
	punpcklbw	%xmm2, %xmm2
	punpcklwd	%xmm1, %xmm1
	punpcklwd	%xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1	/* xmm1 = needle[0] x 16.  */
	pshufd	$0, %xmm2, %xmm2	/* xmm2 = needle[1] x 16.  */
	ja	L(cross_page)		/* Unaligned loads below could fault past
					   the page; take the aligned path.  */
	/* Scan haystack[0..31] for candidate pairs or a terminating NUL.
	   Bit i of %r8 is set when (haystack[i]==needle[0] &&
	   haystack[i+1]==needle[1]) or haystack[i]==0.  */
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6		/* Keep raw bytes for NUL detect.  */
	pcmpeqb	%xmm1, %xmm3		/* haystack[0..15] == needle[0].  */
	pcmpeqb	%xmm2, %xmm4		/* haystack[1..16] == needle[1].  */
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6		/* haystack[0..15] == 0.  */
	pminub	%xmm4, %xmm3		/* 0xff only where both compares hit.  */
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5		/* haystack[16..31] == 0.  */
	pcmpeqb	%xmm2, %xmm3		/* haystack[17..32] == needle[1].  */
	por	%xmm6, %xmm4		/* Low 16: pair-match | NUL.  */
	pcmpeqb	%xmm1, %xmm0		/* haystack[16..31] == needle[0].  */
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0		/* High 16: pair-match | NUL.  */
	pmovmskb	%xmm4, %r8d
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8		/* r8 = 32-bit candidate mask.  */
	je	L(next_32_bytes)	/* No candidate in first 32 bytes.  */
L(next_pair_index):
	bsf	%r8, %rax		/* rax = lowest candidate offset.  */
	addq	%rdi, %rax		/* rax -> candidate match start.  */
	cmpb	$0, (%rax)		/* NUL here means haystack ended before
					   any match.  */
	je	L(zero1)
	movzbl	2(%rsi), %edx		/* First 2 bytes already matched; check
					   needle[2] onward.  */
	testb	%dl, %dl
	je	L(found1)		/* 2-byte needle fully matched.  */
	cmpb	2(%rax), %dl
	jne	L(next_pair)
	xorl	%edx, %edx		/* rdx = verify index.  */
	jmp	L(pair_loop_start)

	.p2align 4
L(strchr):
	/* Single-character needle: delegate to the default strchr
	   implementation (char in %esi, haystack already in %rdi).  */
	movzbl	%al, %esi
	jmp	DEFAULT_STRCHR

	.p2align 4
L(pair_loop):
	/* Verify needle[3+rdx] against haystack bytes following the
	   candidate; fall out to L(next_pair) on the first mismatch.  */
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop)		/* More needle bytes to check.  */
L(found1):
	ret				/* rax already points at the match.  */
L(zero1):
	xorl	%eax, %eax		/* Haystack exhausted: return NULL.  */
	ret

	.p2align 4
L(next_pair):
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest candidate bit.  */
	jne	L(next_pair_index)	/* Try next candidate, if any.  */

	.p2align 4
L(next_32_bytes):
	/* Same pair/NUL scan for haystack[32..63]; candidate bits land in
	   %r8 bits 32..63.  */
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %eax
	salq	$32, %rax
	pmovmskb	%xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8		/* r8 = candidates at offsets 32..63.  */
	je	L(loop_header)		/* Nothing here -> enter main loop.  */
L(next_pair2_index):
	/* Candidate verification, identical in structure to
	   L(next_pair_index) above but for the 32..63 byte window.  */
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero2)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
L(zero2):
	xorl	%eax, %eax
	ret
L(empty):
	mov	%rdi, %rax		/* strstr(s, "") == s.  */
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest candidate bit.  */
	jne	L(next_pair2_index)
L(loop_header):
	/* Main loop setup: %r11 is a work budget (verified-bytes total
	   minus 512 allowance), %r9 remembers where scanning started so
	   forward progress can be measured against it.  */
	movq	$-512, %r11
	movq	%rdi, %r9

	pxor	%xmm7, %xmm7		/* xmm7 = 0, NUL comparator.  */
	andq	$-64, %rdi		/* Align to the 64-byte block.  */

	.p2align 4
L(loop):
	/* Process 64 haystack bytes per iteration.  For each 16-byte
	   chunk, XOR with the broadcast needle bytes gives a zero byte
	   exactly where needle[0]/needle[1] match at adjacent offsets;
	   OR of the two XORs is zero only at a pair match.  pminub folds
	   the four chunks (and the raw bytes, for NUL detection via
	   %xmm0) so one pcmpeqb-with-zero tests the whole block.  Note
	   the unaligned loads at 63/79/31/47(%rdi) are the needle[0]
	   lanes, one byte before the aligned needle[1] lanes, so a set
	   candidate bit marks the position of the SECOND needle byte.  */
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0		/* xmm0 accumulates raw-byte min for
					   NUL detection.  */
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6		/* Chunk 0 pair mask.  */
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3		/* Chunk 1 pair mask.  */
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi		/* Advance; offsets below are now
					   relative to the new block.  */
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4		/* Chunk 2 pair mask.  */
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5		/* Chunk 3 pair mask.  */
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0		/* Zero byte iff pair match or NUL
					   anywhere in the 64 bytes.  */
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	L(loop)
	/* Something is in this block: rebuild the precise per-chunk
	   candidate/NUL masks and pack them into the 64-bit mask %r8.  */
	pminub	(%rdi), %xmm6		/* Fold in raw bytes: zero where pair
					   match OR NUL, chunk 0.  */
	pminub	32(%rdi),%xmm4		/* Chunk 2 likewise.  */
	pminub	48(%rdi),%xmm5		/* Chunk 3 likewise.  */
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm5
	pmovmskb	%xmm6, %edx
	movdqa	16(%rdi), %xmm8		/* Chunk 1 recomputed from scratch.  */
	pcmpeqb	%xmm7, %xmm4
	movdqu	15(%rdi), %xmm0
	pmovmskb	%xmm5, %r8d
	movdqa	%xmm8, %xmm3
	pmovmskb	%xmm4, %ecx
	pcmpeqb	%xmm1,%xmm0
	pcmpeqb	%xmm2,%xmm3
	salq	$32, %rcx
	pcmpeqb	%xmm7,%xmm8		/* Chunk 1 NUL mask.  */
	salq	$48, %r8
	pminub	%xmm0,%xmm3		/* Chunk 1 pair mask.  */
	orq	%rcx, %rdx
	por	%xmm3,%xmm8
	orq	%rdx, %r8
	pmovmskb	%xmm8, %eax
	salq	$16, %rax
	orq	%rax, %r8		/* r8 = candidate bits for this block.  */
	je	L(loop)			/* Spurious wakeup: no candidate after
					   the precise recheck.  */
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx		/* rcx -> needle[1] position; match
					   start is rcx - 1.  */
	cmpb	$0, (%rcx)
	je	L(zero)			/* NUL reached first: no match.  */
	xorl	%eax, %eax		/* rax = verify index (also counts work
					   for the budget in L(next_pair3)).  */
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)
	cmpb	1(%rcx), %dl		/* needle[2] vs haystack[rcx+1].  */
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	lea	-1(%rcx), %rax		/* Candidate bit marks needle[1], so
					   step back one byte.  */
	ret

	.p2align 4
L(next_pair3):
	/* Failed candidate: charge the bytes compared (rax) to the
	   budget.  If total verification work exceeds forward progress
	   (rdi - r9) by more than 512 bytes, this needle/haystack pair
	   is pathological for the pair heuristic -> hand off to the
	   generic O(n) strstr.  */
	addq	%rax, %r11
	movq	%rdi, %rax
	subq	%r9, %rax		/* rax = bytes scanned so far.  */
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest candidate bit.  */
	jne	L(next_pair_index3)
	jmp	L(loop)

	.p2align 4
L(switch_strstr):
	movq	%rdi, %rdi		/* No-op (self-move); kept byte-for-byte
					   — haystack position is already in
					   %rdi for the tail call.  */
	jmp	__strstr_generic

	.p2align 4
L(cross_page):
	/* Haystack starts within 65 bytes of a page boundary, so the
	   unaligned startup loads above could fault.  Instead, read the
	   enclosing 64-byte-aligned block with aligned loads (which
	   cannot cross the page), build the full 64-bit candidate mask
	   for the block, then shift out the bits before the real start.
	   As in L(loop), candidate bits mark the needle[1] position.  */
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax		/* rax = block base <= rdi.  */
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4		/* Safe: -1(%rax) stays inside the
					   preceding/same page only when rdi
					   itself is past the block base; bit 0
					   is discarded below when rdi==rax.  */
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4		/* needle[0] lane, offset by -1.  */
	pcmpeqb	%xmm0, %xmm8		/* NUL lane, chunk 0.  */
	pcmpeqb	%xmm2, %xmm3		/* needle[1] lane.  */
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7		/* NUL lane, chunk 1.  */
	por	%xmm3, %xmm8
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb	%xmm8, %ecx		/* Bits 0..15.  */
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6		/* NUL lane, chunk 2.  */
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm7, %r8d		/* Bits 16..31 (pre-shift).  */
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0		/* NUL lane, chunk 3.  */
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb	%xmm6, %r10d		/* Bits 32..47 (pre-shift).  */
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8
	movl	%edi, %ecx
	pmovmskb	%xmm0, %edx		/* Bits 48..63 (pre-shift).  */
	subl	%eax, %ecx		/* cl = rdi - block base.  */
	salq	$48, %rdx
	orq	%rdx, %r8
	shrq	%cl, %r8		/* Discard candidates before rdi; bit 0
					   now corresponds to rdi itself.  */
	je	L(loop_header)		/* No candidate in the first block.  */
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax		/* rax -> needle[1] position.  */
	cmpb	$0, (%rax)
	je	L(zero)

	cmpq	%rax,%rdi		/* Bit 0 would place needle[0] at
					   rdi-1, before the haystack: skip.  */
	je	L(next_pair4)

	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)
	cmpb	1(%rax), %dl		/* needle[2] vs haystack[rax+1].  */
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq	$1, %rax		/* Step back from needle[1] position to
					   the match start.  */
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest candidate bit.  */
	jne	L(next_pair_index4)
	jmp	L(loop_header)

	.p2align 4
L(found):
	/* NOTE(review): this label appears unreferenced within this file;
	   kept byte-for-byte (rep ret is the legacy AMD branch-target
	   return idiom).  */
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax		/* Not found: return NULL.  */
	ret


END(__strstr_sse2_unaligned)
376 | |