1 | /* Optimized wcscmp for x86-64 with SSE2. |
2 | Copyright (C) 2011-2020 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | |
22 | /* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */ |
23 | |
24 | .text |
25 | ENTRY (__wcscmp) /* %rdi and %rsi hold the two wide strings to compare (first and second SysV AMD64 argument registers); the result is returned in %eax */ |
26 | /* |
27 | * This implementation uses SSE to compare up to 16 bytes at a time. |
28 | */ |
29 | mov %esi, %eax /* eax = low 32 bits of the %rsi pointer */ |
30 | mov %edi, %edx /* edx = low 32 bits of the %rdi pointer */ |
31 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
32 | mov %al, %ch /* ch = low byte of %rsi, saved before eax is masked; used later for the 16-byte alignment test */ |
33 | mov %dl, %cl /* cl = low byte of %rdi, saved before edx is masked */ |
34 | and $63, %eax /* rsi alignment in cache line */ |
35 | and $63, %edx /* rdi alignment in cache line */ |
36 | and $15, %cl /* ZF set when %rdi is 16-byte aligned */ |
37 | jz L(continue_00) |
38 | cmp $16, %edx /* %rdi is not 16-byte aligned: dispatch on which 16-byte quarter of the 64-byte line it points into */ |
39 | jb L(continue_0) |
40 | cmp $32, %edx |
41 | jb L(continue_16) |
42 | cmp $48, %edx |
43 | jb L(continue_32) |
44 | |
45 | L(continue_48): /* Reached by fall-through: %rdi is not 16-byte aligned and edx >= 48 (last quarter of its 64-byte line). Classify %rsi the same way. */ |
46 | and $15, %ch /* ZF set when %rsi is 16-byte aligned */ |
47 | jz L(continue_48_00) |
48 | cmp $16, %eax |
49 | jb L(continue_0_48) |
50 | cmp $32, %eax |
51 | jb L(continue_16_48) |
52 | cmp $48, %eax |
53 | jb L(continue_32_48) |
54 | |
55 | .p2align 4 |
56 | L(continue_48_48): /* Loop, 64 bytes per pass: both pointers are unaligned in the last quarter of their 64-byte lines. Bytes 0-15 are compared one double_word at a time, bytes 16-63 with three unaligned 16-byte loads (presumably so that no 16-byte load straddles a 64-byte line - TODO confirm). %xmm0 is all-zero at the start of every chunk. */ |
57 | mov (%rsi), %ecx |
58 | cmp %ecx, (%rdi) /* flags = (%rdi) - (%rsi); L(nequal) converts them into the return value */ |
59 | jne L(nequal) |
60 | test %ecx, %ecx |
61 | jz L(equal) /* equal and null: both strings end here */ |
62 | |
63 | mov 4(%rsi), %ecx |
64 | cmp %ecx, 4(%rdi) |
65 | jne L(nequal) |
66 | test %ecx, %ecx |
67 | jz L(equal) |
68 | |
69 | mov 8(%rsi), %ecx |
70 | cmp %ecx, 8(%rdi) |
71 | jne L(nequal) |
72 | test %ecx, %ecx |
73 | jz L(equal) |
74 | |
75 | mov 12(%rsi), %ecx |
76 | cmp %ecx, 12(%rdi) |
77 | jne L(nequal) |
78 | test %ecx, %ecx |
79 | jz L(equal) |
80 | |
81 | movdqu 16(%rdi), %xmm1 |
82 | movdqu 16(%rsi), %xmm2 |
83 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
84 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
85 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
86 | pmovmskb %xmm1, %edx |
87 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
88 | jnz L(less4_double_words_16) /* mismatch or terminator somewhere in bytes 16-31 */ |
89 | |
90 | movdqu 32(%rdi), %xmm1 |
91 | movdqu 32(%rsi), %xmm2 |
92 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
93 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
94 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
95 | pmovmskb %xmm1, %edx |
96 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
97 | jnz L(less4_double_words_32) |
98 | |
99 | movdqu 48(%rdi), %xmm1 |
100 | movdqu 48(%rsi), %xmm2 |
101 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
102 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
103 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
104 | pmovmskb %xmm1, %edx |
105 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
106 | jnz L(less4_double_words_48) |
107 | |
108 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
109 | add $64, %rdi |
110 | jmp L(continue_48_48) |
111 | |
112 | L(continue_0): /* %rdi is not 16-byte aligned and edx < 16 (first quarter of its 64-byte line). Classify %rsi. */ |
113 | and $15, %ch /* ZF set when %rsi is 16-byte aligned */ |
114 | jz L(continue_0_00) |
115 | cmp $16, %eax |
116 | jb L(continue_0_0) |
117 | cmp $32, %eax |
118 | jb L(continue_0_16) |
119 | cmp $48, %eax |
120 | jb L(continue_0_32) |
121 | |
122 | .p2align 4 |
123 | L(continue_0_48): /* Loop, 64 bytes per pass: one pointer is unaligned in the first quarter of its 64-byte line, the other in the last quarter. Bytes 0-15 and 48-63 are compared one double_word at a time; bytes 16-47 use two unaligned 16-byte loads. %xmm0 is all-zero at the start of every chunk. */ |
124 | mov (%rsi), %ecx |
125 | cmp %ecx, (%rdi) /* flags = (%rdi) - (%rsi); L(nequal) converts them into the return value */ |
126 | jne L(nequal) |
127 | test %ecx, %ecx |
128 | jz L(equal) /* equal and null: both strings end here */ |
129 | |
130 | mov 4(%rsi), %ecx |
131 | cmp %ecx, 4(%rdi) |
132 | jne L(nequal) |
133 | test %ecx, %ecx |
134 | jz L(equal) |
135 | |
136 | mov 8(%rsi), %ecx |
137 | cmp %ecx, 8(%rdi) |
138 | jne L(nequal) |
139 | test %ecx, %ecx |
140 | jz L(equal) |
141 | |
142 | mov 12(%rsi), %ecx |
143 | cmp %ecx, 12(%rdi) |
144 | jne L(nequal) |
145 | test %ecx, %ecx |
146 | jz L(equal) |
147 | |
148 | movdqu 16(%rdi), %xmm1 |
149 | movdqu 16(%rsi), %xmm2 |
150 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
151 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
152 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
153 | pmovmskb %xmm1, %edx |
154 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
155 | jnz L(less4_double_words_16) |
156 | |
157 | movdqu 32(%rdi), %xmm1 |
158 | movdqu 32(%rsi), %xmm2 |
159 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
160 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
161 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
162 | pmovmskb %xmm1, %edx |
163 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
164 | jnz L(less4_double_words_32) |
165 | |
166 | mov 48(%rsi), %ecx /* bytes 48-63: back to one double_word at a time */ |
167 | cmp %ecx, 48(%rdi) |
168 | jne L(nequal) |
169 | test %ecx, %ecx |
170 | jz L(equal) |
171 | |
172 | mov 52(%rsi), %ecx |
173 | cmp %ecx, 52(%rdi) |
174 | jne L(nequal) |
175 | test %ecx, %ecx |
176 | jz L(equal) |
177 | |
178 | mov 56(%rsi), %ecx |
179 | cmp %ecx, 56(%rdi) |
180 | jne L(nequal) |
181 | test %ecx, %ecx |
182 | jz L(equal) |
183 | |
184 | mov 60(%rsi), %ecx |
185 | cmp %ecx, 60(%rdi) |
186 | jne L(nequal) |
187 | test %ecx, %ecx |
188 | jz L(equal) |
189 | |
190 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
191 | add $64, %rdi |
192 | jmp L(continue_0_48) |
193 | |
194 | .p2align 4 |
195 | L(continue_00): /* %rdi is 16-byte aligned. Classify %rsi by alignment and by its offset in the 64-byte line (eax). */ |
196 | and $15, %ch /* ZF set when %rsi is 16-byte aligned too */ |
197 | jz L(continue_00_00) |
198 | cmp $16, %eax |
199 | jb L(continue_00_0) |
200 | cmp $32, %eax |
201 | jb L(continue_00_16) |
202 | cmp $48, %eax |
203 | jb L(continue_00_32) |
204 | |
205 | .p2align 4 |
206 | L(continue_00_48): /* Loop, 64 bytes per pass: %rdi is 16-byte aligned, %rsi is unaligned in the last quarter of its 64-byte line. Bytes 0-15 are compared one double_word at a time; bytes 16-63 use three 16-byte chunks. %xmm0 is all-zero at the start of every pass. */ |
207 | pcmpeqd (%rdi), %xmm0 /* xmm0 = mask of null double_words in the aligned 16 bytes at %rdi */ |
208 | mov (%rdi), %eax |
209 | pmovmskb %xmm0, %ecx |
210 | test %ecx, %ecx |
211 | jnz L(less4_double_words1) /* terminator within these 4 double_words: take the null-checking path (eax = (%rdi)) */ |
212 | |
213 | cmp (%rsi), %eax /* no null on the %rdi side, so a null in %rsi shows up as a mismatch; equality tests suffice */ |
214 | jne L(nequal) |
215 | |
216 | mov 4(%rdi), %eax |
217 | cmp 4(%rsi), %eax |
218 | jne L(nequal) |
219 | |
220 | mov 8(%rdi), %eax |
221 | cmp 8(%rsi), %eax |
222 | jne L(nequal) |
223 | |
224 | mov 12(%rdi), %eax |
225 | cmp 12(%rsi), %eax |
226 | jne L(nequal) |
227 | |
228 | movdqu 16(%rsi), %xmm2 |
229 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
230 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
231 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
232 | pmovmskb %xmm2, %edx |
233 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
234 | jnz L(less4_double_words_16) |
235 | |
236 | movdqu 32(%rsi), %xmm2 |
237 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
238 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
239 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
240 | pmovmskb %xmm2, %edx |
241 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
242 | jnz L(less4_double_words_32) |
243 | |
244 | movdqu 48(%rsi), %xmm2 |
245 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
246 | pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
247 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
248 | pmovmskb %xmm2, %edx |
249 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
250 | jnz L(less4_double_words_48) |
251 | |
252 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
253 | add $64, %rdi |
254 | jmp L(continue_00_48) |
255 | |
256 | .p2align 4 |
257 | L(continue_32): /* %rdi is not 16-byte aligned and 32 <= edx < 48 (third quarter of its 64-byte line). Classify %rsi. */ |
258 | and $15, %ch /* ZF set when %rsi is 16-byte aligned */ |
259 | jz L(continue_32_00) |
260 | cmp $16, %eax |
261 | jb L(continue_0_32) |
262 | cmp $32, %eax |
263 | jb L(continue_16_32) |
264 | cmp $48, %eax |
265 | jb L(continue_32_32) |
266 | |
267 | .p2align 4 |
268 | L(continue_32_48): /* Loop, 64 bytes per pass: one pointer is unaligned in the third quarter of its 64-byte line, the other in the last quarter. Bytes 0-31 are compared one double_word at a time; bytes 32-63 use two unaligned 16-byte loads. %xmm0 is all-zero at the start of every chunk. */ |
269 | mov (%rsi), %ecx |
270 | cmp %ecx, (%rdi) /* flags = (%rdi) - (%rsi); L(nequal) converts them into the return value */ |
271 | jne L(nequal) |
272 | test %ecx, %ecx |
273 | jz L(equal) /* equal and null: both strings end here */ |
274 | |
275 | mov 4(%rsi), %ecx |
276 | cmp %ecx, 4(%rdi) |
277 | jne L(nequal) |
278 | test %ecx, %ecx |
279 | jz L(equal) |
280 | |
281 | mov 8(%rsi), %ecx |
282 | cmp %ecx, 8(%rdi) |
283 | jne L(nequal) |
284 | test %ecx, %ecx |
285 | jz L(equal) |
286 | |
287 | mov 12(%rsi), %ecx |
288 | cmp %ecx, 12(%rdi) |
289 | jne L(nequal) |
290 | test %ecx, %ecx |
291 | jz L(equal) |
292 | |
293 | mov 16(%rsi), %ecx |
294 | cmp %ecx, 16(%rdi) |
295 | jne L(nequal) |
296 | test %ecx, %ecx |
297 | jz L(equal) |
298 | |
299 | mov 20(%rsi), %ecx |
300 | cmp %ecx, 20(%rdi) |
301 | jne L(nequal) |
302 | test %ecx, %ecx |
303 | jz L(equal) |
304 | |
305 | mov 24(%rsi), %ecx |
306 | cmp %ecx, 24(%rdi) |
307 | jne L(nequal) |
308 | test %ecx, %ecx |
309 | jz L(equal) |
310 | |
311 | mov 28(%rsi), %ecx |
312 | cmp %ecx, 28(%rdi) |
313 | jne L(nequal) |
314 | test %ecx, %ecx |
315 | jz L(equal) |
316 | |
317 | movdqu 32(%rdi), %xmm1 |
318 | movdqu 32(%rsi), %xmm2 |
319 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
320 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
321 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
322 | pmovmskb %xmm1, %edx |
323 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
324 | jnz L(less4_double_words_32) |
325 | |
326 | movdqu 48(%rdi), %xmm1 |
327 | movdqu 48(%rsi), %xmm2 |
328 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
329 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
330 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
331 | pmovmskb %xmm1, %edx |
332 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
333 | jnz L(less4_double_words_48) |
334 | |
335 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
336 | add $64, %rdi |
337 | jmp L(continue_32_48) |
338 | |
339 | .p2align 4 |
340 | L(continue_16): /* %rdi is not 16-byte aligned and 16 <= edx < 32 (second quarter of its 64-byte line). Classify %rsi. */ |
341 | and $15, %ch /* ZF set when %rsi is 16-byte aligned */ |
342 | jz L(continue_16_00) |
343 | cmp $16, %eax |
344 | jb L(continue_0_16) |
345 | cmp $32, %eax |
346 | jb L(continue_16_16) |
347 | cmp $48, %eax |
348 | jb L(continue_16_32) |
349 | |
350 | .p2align 4 |
351 | L(continue_16_48): /* Loop, 64 bytes per pass: one pointer is unaligned in the second quarter of its 64-byte line, the other in the last quarter. Bytes 0-15 and 32-47 are compared one double_word at a time; bytes 16-31 and 48-63 use unaligned 16-byte loads. %xmm0 is all-zero at the start of every chunk. */ |
352 | mov (%rsi), %ecx |
353 | cmp %ecx, (%rdi) /* flags = (%rdi) - (%rsi); L(nequal) converts them into the return value */ |
354 | jne L(nequal) |
355 | test %ecx, %ecx |
356 | jz L(equal) /* equal and null: both strings end here */ |
357 | |
358 | mov 4(%rsi), %ecx |
359 | cmp %ecx, 4(%rdi) |
360 | jne L(nequal) |
361 | test %ecx, %ecx |
362 | jz L(equal) |
363 | |
364 | mov 8(%rsi), %ecx |
365 | cmp %ecx, 8(%rdi) |
366 | jne L(nequal) |
367 | test %ecx, %ecx |
368 | jz L(equal) |
369 | |
370 | mov 12(%rsi), %ecx |
371 | cmp %ecx, 12(%rdi) |
372 | jne L(nequal) |
373 | test %ecx, %ecx |
374 | jz L(equal) |
375 | |
376 | movdqu 16(%rdi), %xmm1 |
377 | movdqu 16(%rsi), %xmm2 |
378 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
379 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
380 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
381 | pmovmskb %xmm1, %edx |
382 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
383 | jnz L(less4_double_words_16) |
384 | |
385 | mov 32(%rsi), %ecx /* bytes 32-47: back to one double_word at a time */ |
386 | cmp %ecx, 32(%rdi) |
387 | jne L(nequal) |
388 | test %ecx, %ecx |
389 | jz L(equal) |
390 | |
391 | mov 36(%rsi), %ecx |
392 | cmp %ecx, 36(%rdi) |
393 | jne L(nequal) |
394 | test %ecx, %ecx |
395 | jz L(equal) |
396 | |
397 | mov 40(%rsi), %ecx |
398 | cmp %ecx, 40(%rdi) |
399 | jne L(nequal) |
400 | test %ecx, %ecx |
401 | jz L(equal) |
402 | |
403 | mov 44(%rsi), %ecx |
404 | cmp %ecx, 44(%rdi) |
405 | jne L(nequal) |
406 | test %ecx, %ecx |
407 | jz L(equal) |
408 | |
409 | movdqu 48(%rdi), %xmm1 |
410 | movdqu 48(%rsi), %xmm2 |
411 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
412 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
413 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
414 | pmovmskb %xmm1, %edx |
415 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
416 | jnz L(less4_double_words_48) |
417 | |
418 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
419 | add $64, %rdi |
420 | jmp L(continue_16_48) |
421 | |
422 | .p2align 4 |
423 | L(continue_00_00): /* Loop, 64 bytes per pass: both pointers are 16-byte aligned, so every chunk uses aligned loads (movdqa and pcmpeqd with a memory operand). %xmm0 is all-zero at the start of every chunk. */ |
424 | movdqa (%rdi), %xmm1 |
425 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
426 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
427 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
428 | pmovmskb %xmm1, %edx |
429 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
430 | jnz L(less4_double_words) /* mismatch or terminator somewhere in bytes 0-15 */ |
431 | |
432 | movdqa 16(%rdi), %xmm3 |
433 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
434 | pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */ |
435 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
436 | pmovmskb %xmm3, %edx |
437 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
438 | jnz L(less4_double_words_16) |
439 | |
440 | movdqa 32(%rdi), %xmm5 |
441 | pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ |
442 | pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */ |
443 | psubb %xmm0, %xmm5 /* packed sub of comparison results*/ |
444 | pmovmskb %xmm5, %edx |
445 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
446 | jnz L(less4_double_words_32) |
447 | |
448 | movdqa 48(%rdi), %xmm1 |
449 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
450 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
451 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
452 | pmovmskb %xmm1, %edx |
453 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
454 | jnz L(less4_double_words_48) |
455 | |
456 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
457 | add $64, %rdi |
458 | jmp L(continue_00_00) |
459 | |
460 | .p2align 4 |
461 | L(continue_00_32): /* One-shot prologue: %rdi is 16-byte aligned, %rsi is unaligned with 32 <= eax < 48. Compare one 16-byte chunk, then advance by 16 so %rsi sits in the last quarter of its 64-byte line and the L(continue_00_48) loop can take over. */ |
462 | movdqu (%rsi), %xmm2 |
463 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
464 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
465 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
466 | pmovmskb %xmm2, %edx |
467 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
468 | jnz L(less4_double_words) |
469 | |
470 | add $16, %rsi |
471 | add $16, %rdi /* %rdi stays 16-byte aligned */ |
472 | jmp L(continue_00_48) |
473 | |
474 | .p2align 4 |
475 | L(continue_00_16): /* One-shot prologue: %rdi is 16-byte aligned, %rsi is unaligned with 16 <= eax < 32. Compare two 16-byte chunks, then advance by 32 so %rsi sits in the last quarter of its 64-byte line and the L(continue_00_48) loop can take over. */ |
476 | movdqu (%rsi), %xmm2 |
477 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
478 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
479 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
480 | pmovmskb %xmm2, %edx |
481 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
482 | jnz L(less4_double_words) |
483 | |
484 | movdqu 16(%rsi), %xmm2 |
485 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
486 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
487 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
488 | pmovmskb %xmm2, %edx |
489 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
490 | jnz L(less4_double_words_16) |
491 | |
492 | add $32, %rsi |
493 | add $32, %rdi /* %rdi stays 16-byte aligned */ |
494 | jmp L(continue_00_48) |
495 | |
496 | .p2align 4 |
497 | L(continue_00_0): /* One-shot prologue: %rdi is 16-byte aligned, %rsi is unaligned with eax < 16. Compare three 16-byte chunks, then advance by 48 so %rsi sits in the last quarter of its 64-byte line and the L(continue_00_48) loop can take over. */ |
498 | movdqu (%rsi), %xmm2 |
499 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
500 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
501 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
502 | pmovmskb %xmm2, %edx |
503 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
504 | jnz L(less4_double_words) |
505 | |
506 | movdqu 16(%rsi), %xmm2 |
507 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
508 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
509 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
510 | pmovmskb %xmm2, %edx |
511 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
512 | jnz L(less4_double_words_16) |
513 | |
514 | movdqu 32(%rsi), %xmm2 |
515 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
516 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
517 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
518 | pmovmskb %xmm2, %edx |
519 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
520 | jnz L(less4_double_words_32) |
521 | |
522 | add $48, %rsi |
523 | add $48, %rdi /* %rdi stays 16-byte aligned */ |
524 | jmp L(continue_00_48) |
525 | |
526 | .p2align 4 |
527 | L(continue_48_00): /* Loop, 64 bytes per pass: %rsi is 16-byte aligned, %rdi is unaligned in the last quarter of its 64-byte line. Bytes 0-15 are compared one double_word at a time; bytes 16-63 use three 16-byte chunks. %xmm0 is all-zero at the start of every pass. */ |
528 | pcmpeqd (%rsi), %xmm0 /* xmm0 = mask of null double_words in the aligned 16 bytes at %rsi */ |
529 | mov (%rdi), %eax |
530 | pmovmskb %xmm0, %ecx |
531 | test %ecx, %ecx |
532 | jnz L(less4_double_words1) /* terminator within these 4 double_words: take the null-checking path (eax = (%rdi)) */ |
533 | |
534 | cmp (%rsi), %eax /* no null on the %rsi side, so a null in %rdi shows up as a mismatch; equality tests suffice */ |
535 | jne L(nequal) |
536 | |
537 | mov 4(%rdi), %eax |
538 | cmp 4(%rsi), %eax |
539 | jne L(nequal) |
540 | |
541 | mov 8(%rdi), %eax |
542 | cmp 8(%rsi), %eax |
543 | jne L(nequal) |
544 | |
545 | mov 12(%rdi), %eax |
546 | cmp 12(%rsi), %eax |
547 | jne L(nequal) |
548 | |
549 | movdqu 16(%rdi), %xmm1 |
550 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
551 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
552 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
553 | pmovmskb %xmm1, %edx |
554 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
555 | jnz L(less4_double_words_16) |
556 | |
557 | movdqu 32(%rdi), %xmm1 |
558 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
559 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
560 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
561 | pmovmskb %xmm1, %edx |
562 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
563 | jnz L(less4_double_words_32) |
564 | |
565 | movdqu 48(%rdi), %xmm1 |
566 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
567 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
568 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
569 | pmovmskb %xmm1, %edx |
570 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
571 | jnz L(less4_double_words_48) |
572 | |
573 | add $64, %rsi /* 64 bytes matched without a terminator: advance both strings */ |
574 | add $64, %rdi |
575 | jmp L(continue_48_00) |
576 | |
577 | .p2align 4 |
578 | L(continue_32_00): /* One-shot prologue: %rsi is 16-byte aligned, %rdi is unaligned with 32 <= edx < 48. Compare one 16-byte chunk, then advance by 16 so %rdi sits in the last quarter of its 64-byte line and the L(continue_48_00) loop can take over. */ |
579 | movdqu (%rdi), %xmm1 |
580 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
581 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
582 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
583 | pmovmskb %xmm1, %edx |
584 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
585 | jnz L(less4_double_words) |
586 | |
587 | add $16, %rsi /* %rsi stays 16-byte aligned */ |
588 | add $16, %rdi |
589 | jmp L(continue_48_00) |
590 | |
591 | .p2align 4 |
592 | L(continue_16_00): /* One-shot prologue: %rsi is 16-byte aligned, %rdi is unaligned with 16 <= edx < 32. Compare two 16-byte chunks, then advance by 32 so %rdi sits in the last quarter of its 64-byte line and the L(continue_48_00) loop can take over. */ |
593 | movdqu (%rdi), %xmm1 |
594 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
595 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
596 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
597 | pmovmskb %xmm1, %edx |
598 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
599 | jnz L(less4_double_words) |
600 | |
601 | movdqu 16(%rdi), %xmm1 |
602 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
603 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
604 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
605 | pmovmskb %xmm1, %edx |
606 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
607 | jnz L(less4_double_words_16) |
608 | |
609 | add $32, %rsi /* %rsi stays 16-byte aligned */ |
610 | add $32, %rdi |
611 | jmp L(continue_48_00) |
612 | |
613 | .p2align 4 |
614 | L(continue_0_00): /* One-shot prologue: %rsi is 16-byte aligned, %rdi is unaligned with edx < 16. Compare three 16-byte chunks, then advance by 48 so %rdi sits in the last quarter of its 64-byte line and the L(continue_48_00) loop can take over. */ |
615 | movdqu (%rdi), %xmm1 |
616 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
617 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
618 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
619 | pmovmskb %xmm1, %edx |
620 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
621 | jnz L(less4_double_words) |
622 | |
623 | movdqu 16(%rdi), %xmm1 |
624 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
625 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
626 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
627 | pmovmskb %xmm1, %edx |
628 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
629 | jnz L(less4_double_words_16) |
630 | |
631 | movdqu 32(%rdi), %xmm1 |
632 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
633 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
634 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
635 | pmovmskb %xmm1, %edx |
636 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
637 | jnz L(less4_double_words_32) |
638 | |
639 | add $48, %rsi /* %rsi stays 16-byte aligned */ |
640 | add $48, %rdi |
641 | jmp L(continue_48_00) |
642 | |
643 | .p2align 4 |
644 | L(continue_32_32): /* One-shot prologue: both pointers are unaligned in the third quarter (offsets 32-47) of their 64-byte lines. Compare one 16-byte chunk, then advance by 16 so both sit in the last quarter and the L(continue_48_48) loop can take over. */ |
645 | movdqu (%rdi), %xmm1 |
646 | movdqu (%rsi), %xmm2 |
647 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
648 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
649 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
650 | pmovmskb %xmm1, %edx |
651 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
652 | jnz L(less4_double_words) |
653 | |
654 | add $16, %rsi |
655 | add $16, %rdi |
656 | jmp L(continue_48_48) |
657 | |
658 | .p2align 4 |
659 | L(continue_16_16): /* One-shot prologue: both pointers are unaligned in the second quarter (offsets 16-31) of their 64-byte lines. Compare two 16-byte chunks, then advance by 32 so both sit in the last quarter and the L(continue_48_48) loop can take over. */ |
660 | movdqu (%rdi), %xmm1 |
661 | movdqu (%rsi), %xmm2 |
662 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
663 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
664 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
665 | pmovmskb %xmm1, %edx |
666 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
667 | jnz L(less4_double_words) |
668 | |
669 | movdqu 16(%rdi), %xmm3 |
670 | movdqu 16(%rsi), %xmm4 |
671 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
672 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
673 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
674 | pmovmskb %xmm3, %edx |
675 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
676 | jnz L(less4_double_words_16) |
677 | |
678 | add $32, %rsi |
679 | add $32, %rdi |
680 | jmp L(continue_48_48) |
681 | |
682 | .p2align 4 |
683 | L(continue_0_0): /* One-shot prologue: both pointers are unaligned in the first quarter (offsets 1-15) of their 64-byte lines. Compare three 16-byte chunks, then advance by 48 so both sit in the last quarter and the L(continue_48_48) loop can take over. */ |
684 | movdqu (%rdi), %xmm1 |
685 | movdqu (%rsi), %xmm2 |
686 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
687 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
688 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
689 | pmovmskb %xmm1, %edx |
690 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
691 | jnz L(less4_double_words) |
692 | |
693 | movdqu 16(%rdi), %xmm3 |
694 | movdqu 16(%rsi), %xmm4 |
695 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
696 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
697 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
698 | pmovmskb %xmm3, %edx |
699 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
700 | jnz L(less4_double_words_16) |
701 | |
702 | movdqu 32(%rdi), %xmm1 |
703 | movdqu 32(%rsi), %xmm2 |
704 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
705 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
706 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
707 | pmovmskb %xmm1, %edx |
708 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
709 | jnz L(less4_double_words_32) |
710 | |
711 | add $48, %rsi |
712 | add $48, %rdi |
713 | jmp L(continue_48_48) |
714 | |
715 | .p2align 4 |
716 | L(continue_0_16): /* One-shot prologue: one pointer is unaligned in the first quarter of its 64-byte line, the other in the second quarter. Compare two 16-byte chunks, then advance by 32; the pointers are then in the third and last quarters, which is the L(continue_32_48) case. */ |
717 | movdqu (%rdi), %xmm1 |
718 | movdqu (%rsi), %xmm2 |
719 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
720 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
721 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
722 | pmovmskb %xmm1, %edx |
723 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
724 | jnz L(less4_double_words) |
725 | |
726 | movdqu 16(%rdi), %xmm1 |
727 | movdqu 16(%rsi), %xmm2 |
728 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
729 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
730 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
731 | pmovmskb %xmm1, %edx |
732 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
733 | jnz L(less4_double_words_16) |
734 | |
735 | add $32, %rsi |
736 | add $32, %rdi |
737 | jmp L(continue_32_48) |
738 | |
739 | .p2align 4 |
740 | L(continue_0_32): /* One-shot prologue: one pointer is unaligned in the first quarter of its 64-byte line, the other in the third quarter. Compare one 16-byte chunk, then advance by 16; the pointers are then in the second and last quarters, which is the L(continue_16_48) case. */ |
741 | movdqu (%rdi), %xmm1 |
742 | movdqu (%rsi), %xmm2 |
743 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
744 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
745 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
746 | pmovmskb %xmm1, %edx |
747 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
748 | jnz L(less4_double_words) |
749 | |
750 | add $16, %rsi |
751 | add $16, %rdi |
752 | jmp L(continue_16_48) |
753 | |
754 | .p2align 4 |
755 | L(continue_16_32): /* One-shot prologue: one pointer is unaligned in the second quarter of its 64-byte line, the other in the third quarter. Compare one 16-byte chunk, then advance by 16; the pointers are then in the third and last quarters, which is the L(continue_32_48) case. */ |
756 | movdqu (%rdi), %xmm1 |
757 | movdqu (%rsi), %xmm2 |
758 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
759 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
760 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
761 | pmovmskb %xmm1, %edx |
762 | sub $0xffff, %edx /* edx == 0 iff all 4 double_words are equal and non-null */ |
763 | jnz L(less4_double_words) |
764 | |
765 | add $16, %rsi |
766 | add $16, %rdi |
767 | jmp L(continue_32_48) |
768 | |
769 | .p2align 4 |
770 | L(less4_double_words1): /* Tail for L(continue_00_48) and L(continue_48_00): the 16-byte-aligned string has a terminator within the next 4 double_words and eax already holds (%rdi). Compare one double_word at a time with null checks. */ |
771 | cmp (%rsi), %eax /* flags = (%rdi) - (%rsi) */ |
772 | jne L(nequal) |
773 | test %eax, %eax |
774 | jz L(equal) |
775 | |
776 | mov 4(%rsi), %ecx |
777 | cmp %ecx, 4(%rdi) |
778 | jne L(nequal) |
779 | test %ecx, %ecx |
780 | jz L(equal) |
781 | |
782 | mov 8(%rsi), %ecx |
783 | cmp %ecx, 8(%rdi) |
784 | jne L(nequal) |
785 | test %ecx, %ecx |
786 | jz L(equal) |
787 | |
788 | mov 12(%rsi), %ecx |
789 | cmp %ecx, 12(%rdi) |
790 | jne L(nequal) |
791 | xor %eax, %eax /* the first three were equal and non-null, so this equal fourth one is the terminator: return 0 */ |
792 | ret |
793 | |
794 | .p2align 4 |
795 | L(less4_double_words): /* Tail for the chunk at offset 0: edx = mask - 0xffff is non-zero, where the mask has 4 bits set per double_word that was equal and non-null. Locate the first double_word that failed that test and compare it directly. */ |
796 | xor %eax, %eax /* eax is reloaded on every path below before returning */ |
797 | test %dl, %dl /* dl == 0: the first two double_words were equal and non-null */ |
798 | jz L(next_two_double_words) |
799 | and $15, %dl /* low nibble == 0: the first double_word was fine, so the second one decides */ |
800 | jz L(second_double_word) |
801 | mov (%rdi), %eax |
802 | cmp (%rsi), %eax /* flags = (%rdi) - (%rsi) for L(nequal) */ |
803 | jne L(nequal) |
804 | ret /* equal here means this double_word is the terminator, so eax == 0 */ |
805 | |
806 | .p2align 4 |
807 | L(second_double_word): |
808 | mov 4(%rdi), %eax |
809 | cmp 4(%rsi), %eax |
810 | jne L(nequal) |
811 | ret /* equal terminator: eax == 0 */ |
812 | |
813 | .p2align 4 |
814 | L(next_two_double_words): |
815 | and $15, %dh /* low nibble of dh == 0: the third double_word was fine, so the fourth one decides */ |
816 | jz L(fourth_double_word) |
817 | mov 8(%rdi), %eax |
818 | cmp 8(%rsi), %eax |
819 | jne L(nequal) |
820 | ret /* equal terminator: eax == 0 */ |
821 | |
822 | .p2align 4 |
823 | L(fourth_double_word): |
824 | mov 12(%rdi), %eax |
825 | cmp 12(%rsi), %eax |
826 | jne L(nequal) |
827 | ret /* equal terminator: eax == 0 */ |
828 | |
829 | .p2align 4 |
830 | L(less4_double_words_16): /* Tail for the chunk at offset 16: edx = mask - 0xffff is non-zero, where the mask has 4 bits set per double_word that was equal and non-null. Locate the first double_word that failed that test and compare it directly. */ |
831 | xor %eax, %eax /* eax is reloaded on every path below before returning */ |
832 | test %dl, %dl /* dl == 0: the first two double_words of the chunk were equal and non-null */ |
833 | jz L(next_two_double_words_16) |
834 | and $15, %dl /* low nibble == 0: the first double_word was fine, so the second one decides */ |
835 | jz L(second_double_word_16) |
836 | mov 16(%rdi), %eax |
837 | cmp 16(%rsi), %eax /* flags = 16(%rdi) - 16(%rsi) for L(nequal) */ |
838 | jne L(nequal) |
839 | ret /* equal here means this double_word is the terminator, so eax == 0 */ |
840 | |
841 | .p2align 4 |
842 | L(second_double_word_16): |
843 | mov 20(%rdi), %eax |
844 | cmp 20(%rsi), %eax |
845 | jne L(nequal) |
846 | ret /* equal terminator: eax == 0 */ |
847 | |
848 | .p2align 4 |
849 | L(next_two_double_words_16): |
850 | and $15, %dh /* low nibble of dh == 0: the third double_word was fine, so the fourth one decides */ |
851 | jz L(fourth_double_word_16) |
852 | mov 24(%rdi), %eax |
853 | cmp 24(%rsi), %eax |
854 | jne L(nequal) |
855 | ret /* equal terminator: eax == 0 */ |
856 | |
857 | .p2align 4 |
858 | L(fourth_double_word_16): |
859 | mov 28(%rdi), %eax |
860 | cmp 28(%rsi), %eax |
861 | jne L(nequal) |
862 | ret /* equal terminator: eax == 0 */ |
863 | |
864 | .p2align 4 |
865 | L(less4_double_words_32): /* Tail for the chunk at offset 32: edx = mask - 0xffff is non-zero, where the mask has 4 bits set per double_word that was equal and non-null. Locate the first double_word that failed that test and compare it directly. */ |
866 | xor %eax, %eax /* eax is reloaded on every path below before returning */ |
867 | test %dl, %dl /* dl == 0: the first two double_words of the chunk were equal and non-null */ |
868 | jz L(next_two_double_words_32) |
869 | and $15, %dl /* low nibble == 0: the first double_word was fine, so the second one decides */ |
870 | jz L(second_double_word_32) |
871 | mov 32(%rdi), %eax |
872 | cmp 32(%rsi), %eax /* flags = 32(%rdi) - 32(%rsi) for L(nequal) */ |
873 | jne L(nequal) |
874 | ret /* equal here means this double_word is the terminator, so eax == 0 */ |
875 | |
876 | .p2align 4 |
877 | L(second_double_word_32): |
878 | mov 36(%rdi), %eax |
879 | cmp 36(%rsi), %eax |
880 | jne L(nequal) |
881 | ret /* equal terminator: eax == 0 */ |
882 | |
883 | .p2align 4 |
884 | L(next_two_double_words_32): |
885 | and $15, %dh /* low nibble of dh == 0: the third double_word was fine, so the fourth one decides */ |
886 | jz L(fourth_double_word_32) |
887 | mov 40(%rdi), %eax |
888 | cmp 40(%rsi), %eax |
889 | jne L(nequal) |
890 | ret /* equal terminator: eax == 0 */ |
891 | |
892 | .p2align 4 |
893 | L(fourth_double_word_32): |
894 | mov 44(%rdi), %eax |
895 | cmp 44(%rsi), %eax |
896 | jne L(nequal) |
897 | ret /* equal terminator: eax == 0 */ |
898 | |
899 | .p2align 4 |
900 | L(less4_double_words_48): /* Tail for the chunk at offset 48: edx = mask - 0xffff is non-zero, where the mask has 4 bits set per double_word that was equal and non-null. Locate the first double_word that failed that test and compare it directly. */ |
901 | xor %eax, %eax /* eax is reloaded on every path below before returning */ |
902 | test %dl, %dl /* dl == 0: the first two double_words of the chunk were equal and non-null */ |
903 | jz L(next_two_double_words_48) |
904 | and $15, %dl /* low nibble == 0: the first double_word was fine, so the second one decides */ |
905 | jz L(second_double_word_48) |
906 | mov 48(%rdi), %eax |
907 | cmp 48(%rsi), %eax /* flags = 48(%rdi) - 48(%rsi) for L(nequal) */ |
908 | jne L(nequal) |
909 | ret /* equal here means this double_word is the terminator, so eax == 0 */ |
910 | |
911 | .p2align 4 |
912 | L(second_double_word_48): |
913 | mov 52(%rdi), %eax |
914 | cmp 52(%rsi), %eax |
915 | jne L(nequal) |
916 | ret /* equal terminator: eax == 0 */ |
917 | |
918 | .p2align 4 |
919 | L(next_two_double_words_48): |
920 | and $15, %dh /* low nibble of dh == 0: the third double_word was fine, so the fourth one decides */ |
921 | jz L(fourth_double_word_48) |
922 | mov 56(%rdi), %eax |
923 | cmp 56(%rsi), %eax |
924 | jne L(nequal) |
925 | ret /* equal terminator: eax == 0 */ |
926 | |
927 | .p2align 4 |
928 | L(fourth_double_word_48): |
929 | mov 60(%rdi), %eax |
930 | cmp 60(%rsi), %eax |
931 | jne L(nequal) |
932 | ret /* equal terminator: eax == 0 */ |
933 | |
934 | .p2align 4 |
935 | L(nequal): /* Common exit for a mismatch. Entered with the flags of a cmp that computed (element from %rdi) - (element from %rsi); returns 1 or -1 in eax. */ |
936 | mov $1, %eax /* mov leaves the flags untouched */ |
937 | jg L(nequal_bigger) /* signed greater: keep 1 */ |
938 | neg %eax /* otherwise return -1 */ |
939 | |
940 | L(nequal_bigger): |
941 | ret |
942 | |
943 | .p2align 4 |
944 | L(equal): /* Common exit when the strings matched up to and including the terminator: return 0. */ |
945 | xor %eax, %eax /* 32-bit xor zeroes all of %rax and needs no REX prefix; same form as the other zeroing sites in this file */ |
946 | ret |
947 | |
948 | END (__wcscmp) |
949 | #ifndef __wcscmp /* the two lines below apply only when __wcscmp is not defined as a preprocessor macro */ |
950 | libc_hidden_def (__wcscmp) /* NOTE(review): macro defined outside this file; presumably emits the hidden internal definition - confirm */ |
951 | weak_alias (__wcscmp, wcscmp) /* NOTE(review): macro defined outside this file; presumably makes wcscmp a weak alias of __wcscmp - confirm */ |
952 | #endif |
953 | |