1 | /* wcscmp optimized with SSE2. |
2 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | /* ISA level >= 2 because there is no wcscmp-sse4 implementation. */ |
22 | #if ISA_SHOULD_BUILD (2) |
23 | # include <sysdep.h> |
24 | |
25 | /* Needed to get right name. */ |
26 | # define USE_AS_WCSCMP |
27 | # define STRCMP_ISA _sse2 |
28 | # include "strcmp-naming.h" |
29 | |
30 | /* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */ |
31 | |
32 | .text |
33 | ENTRY (STRCMP) |
34 | /* |
35 | * This implementation uses SSE to compare up to 16 bytes (four 4-byte characters) at a time: %rdi = s1, %rsi = s2, result in %eax (0 if equal, otherwise 1 or -1 from L(nequal)). |
36 | */ |
37 | mov %esi, %eax |
38 | mov %edi, %edx |
39 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
40 | mov %al, %ch /* %ch = low byte of the %rsi address */ |
41 | mov %dl, %cl /* %cl = low byte of the %rdi address */ |
42 | and $63, %eax /* rsi alignment in cache line */ |
43 | and $63, %edx /* rdi alignment in cache line */ |
44 | and $15, %cl |
45 | jz L(continue_00) /* %rdi is 16-byte aligned */ |
46 | cmp $16, %edx |
47 | jb L(continue_0) /* %rdi offset 1..15 */ |
48 | cmp $32, %edx |
49 | jb L(continue_16) /* %rdi offset 16..31 */ |
50 | cmp $48, %edx |
51 | jb L(continue_32) /* %rdi offset 32..47 */ |
52 | /* Fall through: %rdi offset within its 64-byte block is 48..63; classify %rsi the same way. */ |
53 | L(continue_48): |
54 | and $15, %ch |
55 | jz L(continue_48_00) /* %rsi is 16-byte aligned */ |
56 | cmp $16, %eax |
57 | jb L(continue_0_48) /* %rsi offset 1..15 */ |
58 | cmp $32, %eax |
59 | jb L(continue_16_48) /* %rsi offset 16..31 */ |
60 | cmp $48, %eax |
61 | jb L(continue_32_48) /* %rsi offset 32..47 */ |
62 | /* Loop: bytes 0..15 one character at a time, bytes 16..63 in three SSE steps (NOTE(review): presumably so no 16-byte load straddles a 64-byte boundary - confirm). */ |
63 | .p2align 4 |
64 | L(continue_48_48): |
65 | mov (%rsi), %ecx |
66 | cmp %ecx, (%rdi) /* flags = s1 char - s2 char, consumed by L(nequal) */ |
67 | jne L(nequal) |
68 | test %ecx, %ecx |
69 | jz L(equal) /* equal and zero: terminator in both strings */ |
70 | |
71 | mov 4(%rsi), %ecx |
72 | cmp %ecx, 4(%rdi) |
73 | jne L(nequal) |
74 | test %ecx, %ecx |
75 | jz L(equal) |
76 | |
77 | mov 8(%rsi), %ecx |
78 | cmp %ecx, 8(%rdi) |
79 | jne L(nequal) |
80 | test %ecx, %ecx |
81 | jz L(equal) |
82 | |
83 | mov 12(%rsi), %ecx |
84 | cmp %ecx, 12(%rdi) |
85 | jne L(nequal) |
86 | test %ecx, %ecx |
87 | jz L(equal) |
88 | /* SSE step: the byte mask is 0xffff only when all four dwords match and none is zero, in which case %xmm0 is still all-zero for the next step. */ |
89 | movdqu 16(%rdi), %xmm1 |
90 | movdqu 16(%rsi), %xmm2 |
91 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
92 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
93 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
94 | pmovmskb %xmm1, %edx |
95 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
96 | jnz L(less4_double_words_16) |
97 | |
98 | movdqu 32(%rdi), %xmm1 |
99 | movdqu 32(%rsi), %xmm2 |
100 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
101 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
102 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
103 | pmovmskb %xmm1, %edx |
104 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
105 | jnz L(less4_double_words_32) |
106 | |
107 | movdqu 48(%rdi), %xmm1 |
108 | movdqu 48(%rsi), %xmm2 |
109 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
110 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
111 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
112 | pmovmskb %xmm1, %edx |
113 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
114 | jnz L(less4_double_words_48) |
115 | |
116 | add $64, %rsi |
117 | add $64, %rdi |
118 | jmp L(continue_48_48) |
119 | /* %rdi offset within its 64-byte block is 1..15 (not 16-byte aligned); classify %rsi. */ |
120 | L(continue_0): |
121 | and $15, %ch |
122 | jz L(continue_0_00) /* %rsi is 16-byte aligned */ |
123 | cmp $16, %eax |
124 | jb L(continue_0_0) /* %rsi offset 1..15 */ |
125 | cmp $32, %eax |
126 | jb L(continue_0_16) /* %rsi offset 16..31 */ |
127 | cmp $48, %eax |
128 | jb L(continue_0_32) /* %rsi offset 32..47 */ |
129 | /* Loop (one pointer at offset 1..15, the other at 48..63): bytes 0..15 and 48..63 one character at a time, bytes 16..47 with SSE. */ |
130 | .p2align 4 |
131 | L(continue_0_48): |
132 | mov (%rsi), %ecx |
133 | cmp %ecx, (%rdi) /* flags = s1 char - s2 char, consumed by L(nequal) */ |
134 | jne L(nequal) |
135 | test %ecx, %ecx |
136 | jz L(equal) /* equal and zero: terminator in both strings */ |
137 | |
138 | mov 4(%rsi), %ecx |
139 | cmp %ecx, 4(%rdi) |
140 | jne L(nequal) |
141 | test %ecx, %ecx |
142 | jz L(equal) |
143 | |
144 | mov 8(%rsi), %ecx |
145 | cmp %ecx, 8(%rdi) |
146 | jne L(nequal) |
147 | test %ecx, %ecx |
148 | jz L(equal) |
149 | |
150 | mov 12(%rsi), %ecx |
151 | cmp %ecx, 12(%rdi) |
152 | jne L(nequal) |
153 | test %ecx, %ecx |
154 | jz L(equal) |
155 | /* SSE steps: falling through means all four dwords matched with no zero, so %xmm0 is still all-zero. */ |
156 | movdqu 16(%rdi), %xmm1 |
157 | movdqu 16(%rsi), %xmm2 |
158 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
159 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
160 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
161 | pmovmskb %xmm1, %edx |
162 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
163 | jnz L(less4_double_words_16) |
164 | |
165 | movdqu 32(%rdi), %xmm1 |
166 | movdqu 32(%rsi), %xmm2 |
167 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
168 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
169 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
170 | pmovmskb %xmm1, %edx |
171 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
172 | jnz L(less4_double_words_32) |
173 | /* Bytes 48..63 again one character at a time. */ |
174 | mov 48(%rsi), %ecx |
175 | cmp %ecx, 48(%rdi) |
176 | jne L(nequal) |
177 | test %ecx, %ecx |
178 | jz L(equal) |
179 | |
180 | mov 52(%rsi), %ecx |
181 | cmp %ecx, 52(%rdi) |
182 | jne L(nequal) |
183 | test %ecx, %ecx |
184 | jz L(equal) |
185 | |
186 | mov 56(%rsi), %ecx |
187 | cmp %ecx, 56(%rdi) |
188 | jne L(nequal) |
189 | test %ecx, %ecx |
190 | jz L(equal) |
191 | |
192 | mov 60(%rsi), %ecx |
193 | cmp %ecx, 60(%rdi) |
194 | jne L(nequal) |
195 | test %ecx, %ecx |
196 | jz L(equal) |
197 | |
198 | add $64, %rsi |
199 | add $64, %rdi |
200 | jmp L(continue_0_48) |
201 | /* %rdi is 16-byte aligned; classify %rsi. */ |
202 | .p2align 4 |
203 | L(continue_00): |
204 | and $15, %ch |
205 | jz L(continue_00_00) /* %rsi is 16-byte aligned too */ |
206 | cmp $16, %eax |
207 | jb L(continue_00_0) /* %rsi offset 1..15 */ |
208 | cmp $32, %eax |
209 | jb L(continue_00_16) /* %rsi offset 16..31 */ |
210 | cmp $48, %eax |
211 | jb L(continue_00_32) /* %rsi offset 32..47 */ |
212 | /* Loop with %rdi 16-byte aligned: check the first 16 bytes of s1 for a terminator, compare them per character, then three SSE steps. */ |
213 | .p2align 4 |
214 | L(continue_00_48): |
215 | pcmpeqd (%rdi), %xmm0 /* flag zero dwords in the first 16 bytes of s1 */ |
216 | mov (%rdi), %eax /* first s1 character; L(less4_double_words1) expects it in %eax */ |
217 | pmovmskb %xmm0, %ecx |
218 | test %ecx, %ecx |
219 | jnz L(less4_double_words1) |
220 | /* No terminator in these 16 bytes of s1 (and %xmm0 is still all-zero), so only inequality is tested. */ |
221 | cmp (%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
222 | jne L(nequal) |
223 | |
224 | mov 4(%rdi), %eax |
225 | cmp 4(%rsi), %eax |
226 | jne L(nequal) |
227 | |
228 | mov 8(%rdi), %eax |
229 | cmp 8(%rsi), %eax |
230 | jne L(nequal) |
231 | |
232 | mov 12(%rdi), %eax |
233 | cmp 12(%rsi), %eax |
234 | jne L(nequal) |
235 | /* SSE steps: s2 is loaded unaligned and checked for zero dwords; s1 is read as an aligned memory operand. */ |
236 | movdqu 16(%rsi), %xmm2 |
237 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
238 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
239 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
240 | pmovmskb %xmm2, %edx |
241 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
242 | jnz L(less4_double_words_16) |
243 | |
244 | movdqu 32(%rsi), %xmm2 |
245 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
246 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
247 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
248 | pmovmskb %xmm2, %edx |
249 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
250 | jnz L(less4_double_words_32) |
251 | |
252 | movdqu 48(%rsi), %xmm2 |
253 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
254 | pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
255 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
256 | pmovmskb %xmm2, %edx |
257 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
258 | jnz L(less4_double_words_48) |
259 | |
260 | add $64, %rsi |
261 | add $64, %rdi |
262 | jmp L(continue_00_48) |
263 | /* %rdi offset within its 64-byte block is 32..47 (not 16-byte aligned); classify %rsi. */ |
264 | .p2align 4 |
265 | L(continue_32): |
266 | and $15, %ch |
267 | jz L(continue_32_00) /* %rsi is 16-byte aligned */ |
268 | cmp $16, %eax |
269 | jb L(continue_0_32) /* %rsi offset 1..15 */ |
270 | cmp $32, %eax |
271 | jb L(continue_16_32) /* %rsi offset 16..31 */ |
272 | cmp $48, %eax |
273 | jb L(continue_32_32) /* %rsi offset 32..47 */ |
274 | /* Loop: bytes 0..31 one character at a time, bytes 32..63 in two SSE steps. */ |
275 | .p2align 4 |
276 | L(continue_32_48): |
277 | mov (%rsi), %ecx |
278 | cmp %ecx, (%rdi) /* flags = s1 char - s2 char, consumed by L(nequal) */ |
279 | jne L(nequal) |
280 | test %ecx, %ecx |
281 | jz L(equal) /* equal and zero: terminator in both strings */ |
282 | |
283 | mov 4(%rsi), %ecx |
284 | cmp %ecx, 4(%rdi) |
285 | jne L(nequal) |
286 | test %ecx, %ecx |
287 | jz L(equal) |
288 | |
289 | mov 8(%rsi), %ecx |
290 | cmp %ecx, 8(%rdi) |
291 | jne L(nequal) |
292 | test %ecx, %ecx |
293 | jz L(equal) |
294 | |
295 | mov 12(%rsi), %ecx |
296 | cmp %ecx, 12(%rdi) |
297 | jne L(nequal) |
298 | test %ecx, %ecx |
299 | jz L(equal) |
300 | |
301 | mov 16(%rsi), %ecx |
302 | cmp %ecx, 16(%rdi) |
303 | jne L(nequal) |
304 | test %ecx, %ecx |
305 | jz L(equal) |
306 | |
307 | mov 20(%rsi), %ecx |
308 | cmp %ecx, 20(%rdi) |
309 | jne L(nequal) |
310 | test %ecx, %ecx |
311 | jz L(equal) |
312 | |
313 | mov 24(%rsi), %ecx |
314 | cmp %ecx, 24(%rdi) |
315 | jne L(nequal) |
316 | test %ecx, %ecx |
317 | jz L(equal) |
318 | |
319 | mov 28(%rsi), %ecx |
320 | cmp %ecx, 28(%rdi) |
321 | jne L(nequal) |
322 | test %ecx, %ecx |
323 | jz L(equal) |
324 | /* SSE steps: falling through means all four dwords matched with no zero, so %xmm0 is still all-zero. */ |
325 | movdqu 32(%rdi), %xmm1 |
326 | movdqu 32(%rsi), %xmm2 |
327 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
328 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
329 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
330 | pmovmskb %xmm1, %edx |
331 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
332 | jnz L(less4_double_words_32) |
333 | |
334 | movdqu 48(%rdi), %xmm1 |
335 | movdqu 48(%rsi), %xmm2 |
336 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
337 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
338 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
339 | pmovmskb %xmm1, %edx |
340 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
341 | jnz L(less4_double_words_48) |
342 | |
343 | add $64, %rsi |
344 | add $64, %rdi |
345 | jmp L(continue_32_48) |
346 | /* %rdi offset within its 64-byte block is 16..31 (not 16-byte aligned); classify %rsi. */ |
347 | .p2align 4 |
348 | L(continue_16): |
349 | and $15, %ch |
350 | jz L(continue_16_00) /* %rsi is 16-byte aligned */ |
351 | cmp $16, %eax |
352 | jb L(continue_0_16) /* %rsi offset 1..15 */ |
353 | cmp $32, %eax |
354 | jb L(continue_16_16) /* %rsi offset 16..31 */ |
355 | cmp $48, %eax |
356 | jb L(continue_16_32) /* %rsi offset 32..47 */ |
357 | /* Loop: bytes 0..15 and 32..47 one character at a time, bytes 16..31 and 48..63 with SSE. */ |
358 | .p2align 4 |
359 | L(continue_16_48): |
360 | mov (%rsi), %ecx |
361 | cmp %ecx, (%rdi) /* flags = s1 char - s2 char, consumed by L(nequal) */ |
362 | jne L(nequal) |
363 | test %ecx, %ecx |
364 | jz L(equal) /* equal and zero: terminator in both strings */ |
365 | |
366 | mov 4(%rsi), %ecx |
367 | cmp %ecx, 4(%rdi) |
368 | jne L(nequal) |
369 | test %ecx, %ecx |
370 | jz L(equal) |
371 | |
372 | mov 8(%rsi), %ecx |
373 | cmp %ecx, 8(%rdi) |
374 | jne L(nequal) |
375 | test %ecx, %ecx |
376 | jz L(equal) |
377 | |
378 | mov 12(%rsi), %ecx |
379 | cmp %ecx, 12(%rdi) |
380 | jne L(nequal) |
381 | test %ecx, %ecx |
382 | jz L(equal) |
383 | /* SSE step for bytes 16..31; falling through leaves %xmm0 all-zero. */ |
384 | movdqu 16(%rdi), %xmm1 |
385 | movdqu 16(%rsi), %xmm2 |
386 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
387 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
388 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
389 | pmovmskb %xmm1, %edx |
390 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
391 | jnz L(less4_double_words_16) |
392 | /* Bytes 32..47 one character at a time. */ |
393 | mov 32(%rsi), %ecx |
394 | cmp %ecx, 32(%rdi) |
395 | jne L(nequal) |
396 | test %ecx, %ecx |
397 | jz L(equal) |
398 | |
399 | mov 36(%rsi), %ecx |
400 | cmp %ecx, 36(%rdi) |
401 | jne L(nequal) |
402 | test %ecx, %ecx |
403 | jz L(equal) |
404 | |
405 | mov 40(%rsi), %ecx |
406 | cmp %ecx, 40(%rdi) |
407 | jne L(nequal) |
408 | test %ecx, %ecx |
409 | jz L(equal) |
410 | |
411 | mov 44(%rsi), %ecx |
412 | cmp %ecx, 44(%rdi) |
413 | jne L(nequal) |
414 | test %ecx, %ecx |
415 | jz L(equal) |
416 | /* SSE step for bytes 48..63. */ |
417 | movdqu 48(%rdi), %xmm1 |
418 | movdqu 48(%rsi), %xmm2 |
419 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
420 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
421 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
422 | pmovmskb %xmm1, %edx |
423 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
424 | jnz L(less4_double_words_48) |
425 | |
426 | add $64, %rsi |
427 | add $64, %rdi |
428 | jmp L(continue_16_48) |
429 | /* Loop with both pointers 16-byte aligned: four SSE steps per 64 bytes, s1 via movdqa and s2 as an aligned memory operand. */ |
430 | .p2align 4 |
431 | L(continue_00_00): |
432 | movdqa (%rdi), %xmm1 |
433 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
434 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
435 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
436 | pmovmskb %xmm1, %edx |
437 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
438 | jnz L(less4_double_words) |
439 | /* Falling through means all four dwords matched with no zero, so %xmm0 is still all-zero. */ |
440 | movdqa 16(%rdi), %xmm3 |
441 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
442 | pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */ |
443 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
444 | pmovmskb %xmm3, %edx |
445 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
446 | jnz L(less4_double_words_16) |
447 | |
448 | movdqa 32(%rdi), %xmm5 |
449 | pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ |
450 | pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */ |
451 | psubb %xmm0, %xmm5 /* packed sub of comparison results*/ |
452 | pmovmskb %xmm5, %edx |
453 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
454 | jnz L(less4_double_words_32) |
455 | |
456 | movdqa 48(%rdi), %xmm1 |
457 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
458 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
459 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
460 | pmovmskb %xmm1, %edx |
461 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
462 | jnz L(less4_double_words_48) |
463 | |
464 | add $64, %rsi |
465 | add $64, %rdi |
466 | jmp L(continue_00_00) |
467 | /* %rdi 16-byte aligned, %rsi offset 32..47: one SSE step, then join L(continue_00_48) with both pointers advanced by 16. */ |
468 | .p2align 4 |
469 | L(continue_00_32): |
470 | movdqu (%rsi), %xmm2 |
471 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
472 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
473 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
474 | pmovmskb %xmm2, %edx |
475 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
476 | jnz L(less4_double_words) |
477 | |
478 | add $16, %rsi |
479 | add $16, %rdi |
480 | jmp L(continue_00_48) |
481 | /* %rdi 16-byte aligned, %rsi offset 16..31: two SSE steps, then join L(continue_00_48) with both pointers advanced by 32. */ |
482 | .p2align 4 |
483 | L(continue_00_16): |
484 | movdqu (%rsi), %xmm2 |
485 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
486 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
487 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
488 | pmovmskb %xmm2, %edx |
489 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
490 | jnz L(less4_double_words) |
491 | |
492 | movdqu 16(%rsi), %xmm2 |
493 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
494 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
495 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
496 | pmovmskb %xmm2, %edx |
497 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
498 | jnz L(less4_double_words_16) |
499 | |
500 | add $32, %rsi |
501 | add $32, %rdi |
502 | jmp L(continue_00_48) |
503 | /* %rdi 16-byte aligned, %rsi offset 1..15: three SSE steps, then join L(continue_00_48) with both pointers advanced by 48. */ |
504 | .p2align 4 |
505 | L(continue_00_0): |
506 | movdqu (%rsi), %xmm2 |
507 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
508 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
509 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
510 | pmovmskb %xmm2, %edx |
511 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
512 | jnz L(less4_double_words) |
513 | |
514 | movdqu 16(%rsi), %xmm2 |
515 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
516 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
517 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
518 | pmovmskb %xmm2, %edx |
519 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
520 | jnz L(less4_double_words_16) |
521 | |
522 | movdqu 32(%rsi), %xmm2 |
523 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
524 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
525 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
526 | pmovmskb %xmm2, %edx |
527 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
528 | jnz L(less4_double_words_32) |
529 | |
530 | add $48, %rsi |
531 | add $48, %rdi |
532 | jmp L(continue_00_48) |
533 | /* Loop with %rsi 16-byte aligned (mirror of L(continue_00_48)): the terminator check on the first 16 bytes is done on s2. */ |
534 | .p2align 4 |
535 | L(continue_48_00): |
536 | pcmpeqd (%rsi), %xmm0 /* flag zero dwords in the first 16 bytes of s2 */ |
537 | mov (%rdi), %eax /* first s1 character; L(less4_double_words1) expects it in %eax */ |
538 | pmovmskb %xmm0, %ecx |
539 | test %ecx, %ecx |
540 | jnz L(less4_double_words1) |
541 | /* No terminator in these 16 bytes of s2 (and %xmm0 is still all-zero), so only inequality is tested. */ |
542 | cmp (%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
543 | jne L(nequal) |
544 | |
545 | mov 4(%rdi), %eax |
546 | cmp 4(%rsi), %eax |
547 | jne L(nequal) |
548 | |
549 | mov 8(%rdi), %eax |
550 | cmp 8(%rsi), %eax |
551 | jne L(nequal) |
552 | |
553 | mov 12(%rdi), %eax |
554 | cmp 12(%rsi), %eax |
555 | jne L(nequal) |
556 | /* SSE steps: s1 is loaded unaligned and checked for zero dwords; s2 is read as an aligned memory operand. */ |
557 | movdqu 16(%rdi), %xmm1 |
558 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
559 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
560 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
561 | pmovmskb %xmm1, %edx |
562 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
563 | jnz L(less4_double_words_16) |
564 | |
565 | movdqu 32(%rdi), %xmm1 |
566 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
567 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
568 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
569 | pmovmskb %xmm1, %edx |
570 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
571 | jnz L(less4_double_words_32) |
572 | |
573 | movdqu 48(%rdi), %xmm1 |
574 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
575 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
576 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
577 | pmovmskb %xmm1, %edx |
578 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
579 | jnz L(less4_double_words_48) |
580 | |
581 | add $64, %rsi |
582 | add $64, %rdi |
583 | jmp L(continue_48_00) |
584 | /* %rsi 16-byte aligned, %rdi offset 32..47: one SSE step, then join L(continue_48_00) with both pointers advanced by 16. */ |
585 | .p2align 4 |
586 | L(continue_32_00): |
587 | movdqu (%rdi), %xmm1 |
588 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
589 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
590 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
591 | pmovmskb %xmm1, %edx |
592 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
593 | jnz L(less4_double_words) |
594 | |
595 | add $16, %rsi |
596 | add $16, %rdi |
597 | jmp L(continue_48_00) |
598 | /* %rsi 16-byte aligned, %rdi offset 16..31: two SSE steps, then join L(continue_48_00) with both pointers advanced by 32. */ |
599 | .p2align 4 |
600 | L(continue_16_00): |
601 | movdqu (%rdi), %xmm1 |
602 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
603 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
604 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
605 | pmovmskb %xmm1, %edx |
606 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
607 | jnz L(less4_double_words) |
608 | |
609 | movdqu 16(%rdi), %xmm1 |
610 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
611 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
612 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
613 | pmovmskb %xmm1, %edx |
614 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
615 | jnz L(less4_double_words_16) |
616 | |
617 | add $32, %rsi |
618 | add $32, %rdi |
619 | jmp L(continue_48_00) |
620 | /* %rsi 16-byte aligned, %rdi offset 1..15: three SSE steps, then join L(continue_48_00) with both pointers advanced by 48. */ |
621 | .p2align 4 |
622 | L(continue_0_00): |
623 | movdqu (%rdi), %xmm1 |
624 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
625 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
626 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
627 | pmovmskb %xmm1, %edx |
628 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
629 | jnz L(less4_double_words) |
630 | |
631 | movdqu 16(%rdi), %xmm1 |
632 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
633 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
634 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
635 | pmovmskb %xmm1, %edx |
636 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
637 | jnz L(less4_double_words_16) |
638 | |
639 | movdqu 32(%rdi), %xmm1 |
640 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
641 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
642 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
643 | pmovmskb %xmm1, %edx |
644 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
645 | jnz L(less4_double_words_32) |
646 | |
647 | add $48, %rsi |
648 | add $48, %rdi |
649 | jmp L(continue_48_00) |
650 | /* Both offsets 32..47 (neither pointer 16-byte aligned): one SSE step, then join L(continue_48_48) with both pointers advanced by 16. */ |
651 | .p2align 4 |
652 | L(continue_32_32): |
653 | movdqu (%rdi), %xmm1 |
654 | movdqu (%rsi), %xmm2 |
655 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
656 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
657 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
658 | pmovmskb %xmm1, %edx |
659 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
660 | jnz L(less4_double_words) |
661 | |
662 | add $16, %rsi |
663 | add $16, %rdi |
664 | jmp L(continue_48_48) |
665 | /* Both offsets 16..31 (neither pointer 16-byte aligned): two SSE steps, then join L(continue_48_48) with both pointers advanced by 32. */ |
666 | .p2align 4 |
667 | L(continue_16_16): |
668 | movdqu (%rdi), %xmm1 |
669 | movdqu (%rsi), %xmm2 |
670 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
671 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
672 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
673 | pmovmskb %xmm1, %edx |
674 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
675 | jnz L(less4_double_words) |
676 | |
677 | movdqu 16(%rdi), %xmm3 |
678 | movdqu 16(%rsi), %xmm4 |
679 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
680 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
681 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
682 | pmovmskb %xmm3, %edx |
683 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
684 | jnz L(less4_double_words_16) |
685 | |
686 | add $32, %rsi |
687 | add $32, %rdi |
688 | jmp L(continue_48_48) |
689 | /* Both offsets 1..15 (neither pointer 16-byte aligned): three SSE steps, then join L(continue_48_48) with both pointers advanced by 48. */ |
690 | .p2align 4 |
691 | L(continue_0_0): |
692 | movdqu (%rdi), %xmm1 |
693 | movdqu (%rsi), %xmm2 |
694 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
695 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
696 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
697 | pmovmskb %xmm1, %edx |
698 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
699 | jnz L(less4_double_words) |
700 | |
701 | movdqu 16(%rdi), %xmm3 |
702 | movdqu 16(%rsi), %xmm4 |
703 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
704 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
705 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
706 | pmovmskb %xmm3, %edx |
707 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
708 | jnz L(less4_double_words_16) |
709 | |
710 | movdqu 32(%rdi), %xmm1 |
711 | movdqu 32(%rsi), %xmm2 |
712 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
713 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
714 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
715 | pmovmskb %xmm1, %edx |
716 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
717 | jnz L(less4_double_words_32) |
718 | |
719 | add $48, %rsi |
720 | add $48, %rdi |
721 | jmp L(continue_48_48) |
722 | /* One pointer at offset 1..15, the other at 16..31 (either order): two SSE steps, then join L(continue_32_48) with both pointers advanced by 32. */ |
723 | .p2align 4 |
724 | L(continue_0_16): |
725 | movdqu (%rdi), %xmm1 |
726 | movdqu (%rsi), %xmm2 |
727 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
728 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
729 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
730 | pmovmskb %xmm1, %edx |
731 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
732 | jnz L(less4_double_words) |
733 | |
734 | movdqu 16(%rdi), %xmm1 |
735 | movdqu 16(%rsi), %xmm2 |
736 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
737 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
738 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
739 | pmovmskb %xmm1, %edx |
740 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
741 | jnz L(less4_double_words_16) |
742 | |
743 | add $32, %rsi |
744 | add $32, %rdi |
745 | jmp L(continue_32_48) |
746 | /* One pointer at offset 1..15, the other at 32..47 (either order): one SSE step, then join L(continue_16_48) with both pointers advanced by 16. */ |
747 | .p2align 4 |
748 | L(continue_0_32): |
749 | movdqu (%rdi), %xmm1 |
750 | movdqu (%rsi), %xmm2 |
751 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
752 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
753 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
754 | pmovmskb %xmm1, %edx |
755 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
756 | jnz L(less4_double_words) |
757 | |
758 | add $16, %rsi |
759 | add $16, %rdi |
760 | jmp L(continue_16_48) |
761 | /* One pointer at offset 16..31, the other at 32..47 (either order): one SSE step, then join L(continue_32_48) with both pointers advanced by 16. */ |
762 | .p2align 4 |
763 | L(continue_16_32): |
764 | movdqu (%rdi), %xmm1 |
765 | movdqu (%rsi), %xmm2 |
766 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
767 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
768 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
769 | pmovmskb %xmm1, %edx |
770 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
771 | jnz L(less4_double_words) |
772 | |
773 | add $16, %rsi |
774 | add $16, %rdi |
775 | jmp L(continue_32_48) |
776 | /* Entered when the 16-byte aligned string has a zero dword in its current 16 bytes; %eax already holds the first s1 character. Compare one character at a time. */ |
777 | .p2align 4 |
778 | L(less4_double_words1): |
779 | cmp (%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
780 | jne L(nequal) |
781 | test %eax, %eax |
782 | jz L(equal) /* equal and zero: terminator in both strings */ |
783 | |
784 | mov 4(%rsi), %ecx |
785 | cmp %ecx, 4(%rdi) |
786 | jne L(nequal) |
787 | test %ecx, %ecx |
788 | jz L(equal) |
789 | |
790 | mov 8(%rsi), %ecx |
791 | cmp %ecx, 8(%rdi) |
792 | jne L(nequal) |
793 | test %ecx, %ecx |
794 | jz L(equal) |
795 | |
796 | mov 12(%rsi), %ecx |
797 | cmp %ecx, 12(%rdi) |
798 | jne L(nequal) |
799 | xor %eax, %eax /* fourth characters equal as well: return 0 without testing for zero */ |
800 | ret |
801 | /* Tail for an SSE step at offset 0: %edx = byte mask - 0xffff (non-zero); find the first dword that differs or is zero and compare it. */ |
802 | .p2align 4 |
803 | L(less4_double_words): |
804 | /* %eax is not cleared here: every path below loads it before it is used. */ |
805 | test %dl, %dl |
806 | jz L(next_two_double_words) /* dwords 0 and 1 are fine: look at dwords 2 and 3 */ |
807 | and $15, %dl |
808 | jz L(second_double_word) /* dword 0 is fine: dword 1 stopped the loop */ |
809 | mov (%rdi), %eax |
810 | cmp (%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
811 | jne L(nequal) |
812 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
813 | |
814 | .p2align 4 |
815 | L(second_double_word): |
816 | mov 4(%rdi), %eax |
817 | cmp 4(%rsi), %eax |
818 | jne L(nequal) |
819 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
820 | |
821 | .p2align 4 |
822 | L(next_two_double_words): |
823 | and $15, %dh |
824 | jz L(fourth_double_word) /* dword 2 is fine: dword 3 stopped the loop */ |
825 | mov 8(%rdi), %eax |
826 | cmp 8(%rsi), %eax |
827 | jne L(nequal) |
828 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
829 | |
830 | .p2align 4 |
831 | L(fourth_double_word): |
832 | mov 12(%rdi), %eax |
833 | cmp 12(%rsi), %eax |
834 | jne L(nequal) |
835 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
836 | /* Tail for an SSE step at offset 16: %edx = byte mask - 0xffff (non-zero); find the first dword that differs or is zero and compare it. */ |
837 | .p2align 4 |
838 | L(less4_double_words_16): |
839 | /* %eax is not cleared here: every path below loads it before it is used. */ |
840 | test %dl, %dl |
841 | jz L(next_two_double_words_16) /* dwords 0 and 1 are fine: look at dwords 2 and 3 */ |
842 | and $15, %dl |
843 | jz L(second_double_word_16) /* dword 0 is fine: dword 1 stopped the loop */ |
844 | mov 16(%rdi), %eax |
845 | cmp 16(%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
846 | jne L(nequal) |
847 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
848 | |
849 | .p2align 4 |
850 | L(second_double_word_16): |
851 | mov 20(%rdi), %eax |
852 | cmp 20(%rsi), %eax |
853 | jne L(nequal) |
854 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
855 | |
856 | .p2align 4 |
857 | L(next_two_double_words_16): |
858 | and $15, %dh |
859 | jz L(fourth_double_word_16) /* dword 2 is fine: dword 3 stopped the loop */ |
860 | mov 24(%rdi), %eax |
861 | cmp 24(%rsi), %eax |
862 | jne L(nequal) |
863 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
864 | |
865 | .p2align 4 |
866 | L(fourth_double_word_16): |
867 | mov 28(%rdi), %eax |
868 | cmp 28(%rsi), %eax |
869 | jne L(nequal) |
870 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
871 | /* Tail for an SSE step at offset 32: %edx = byte mask - 0xffff (non-zero); find the first dword that differs or is zero and compare it. */ |
872 | .p2align 4 |
873 | L(less4_double_words_32): |
874 | /* %eax is not cleared here: every path below loads it before it is used. */ |
875 | test %dl, %dl |
876 | jz L(next_two_double_words_32) /* dwords 0 and 1 are fine: look at dwords 2 and 3 */ |
877 | and $15, %dl |
878 | jz L(second_double_word_32) /* dword 0 is fine: dword 1 stopped the loop */ |
879 | mov 32(%rdi), %eax |
880 | cmp 32(%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
881 | jne L(nequal) |
882 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
883 | |
884 | .p2align 4 |
885 | L(second_double_word_32): |
886 | mov 36(%rdi), %eax |
887 | cmp 36(%rsi), %eax |
888 | jne L(nequal) |
889 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
890 | |
891 | .p2align 4 |
892 | L(next_two_double_words_32): |
893 | and $15, %dh |
894 | jz L(fourth_double_word_32) /* dword 2 is fine: dword 3 stopped the loop */ |
895 | mov 40(%rdi), %eax |
896 | cmp 40(%rsi), %eax |
897 | jne L(nequal) |
898 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
899 | |
900 | .p2align 4 |
901 | L(fourth_double_word_32): |
902 | mov 44(%rdi), %eax |
903 | cmp 44(%rsi), %eax |
904 | jne L(nequal) |
905 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
906 | /* Tail for an SSE step at offset 48: %edx = byte mask - 0xffff (non-zero); find the first dword that differs or is zero and compare it. */ |
907 | .p2align 4 |
908 | L(less4_double_words_48): |
909 | /* %eax is not cleared here: every path below loads it before it is used. */ |
910 | test %dl, %dl |
911 | jz L(next_two_double_words_48) /* dwords 0 and 1 are fine: look at dwords 2 and 3 */ |
912 | and $15, %dl |
913 | jz L(second_double_word_48) /* dword 0 is fine: dword 1 stopped the loop */ |
914 | mov 48(%rdi), %eax |
915 | cmp 48(%rsi), %eax /* flags = s1 char - s2 char, consumed by L(nequal) */ |
916 | jne L(nequal) |
917 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
918 | |
919 | .p2align 4 |
920 | L(second_double_word_48): |
921 | mov 52(%rdi), %eax |
922 | cmp 52(%rsi), %eax |
923 | jne L(nequal) |
924 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
925 | |
926 | .p2align 4 |
927 | L(next_two_double_words_48): |
928 | and $15, %dh |
929 | jz L(fourth_double_word_48) /* dword 2 is fine: dword 3 stopped the loop */ |
930 | mov 56(%rdi), %eax |
931 | cmp 56(%rsi), %eax |
932 | jne L(nequal) |
933 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
934 | |
935 | .p2align 4 |
936 | L(fourth_double_word_48): |
937 | mov 60(%rdi), %eax |
938 | cmp 60(%rsi), %eax |
939 | jne L(nequal) |
940 | ret /* equal, so this dword is the terminator and %eax is 0 */ |
941 | /* Reached via jne with flags from "cmp s2 char, s1 char": return 1 if s1 char > s2 char (signed), otherwise -1. */ |
942 | .p2align 4 |
943 | L(nequal): |
944 | setg %al /* %al = 1 when s1 char > s2 char, else 0 */ |
945 | movzbl %al, %eax /* widen to %eax; flags are no longer needed */ |
946 | lea -1(%rax,%rax), %eax /* 2 * %eax - 1: 1 or -1 */ |
947 | |
948 | /* Single exit for both outcomes; no branch is needed. */ |
949 | ret |
950 | /* Both strings reached their terminator with every character equal: return 0. */ |
951 | .p2align 4 |
952 | L(equal): |
953 | xor %eax, %eax /* 32-bit form also clears the upper half of %rax and needs no REX prefix */ |
954 | ret |
955 | |
956 | END (STRCMP) |
957 | #endif |
958 | |