1 | /* strcmp with SSE4.2 |
2 | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifndef STRCMP_SSE42 |
22 | # define STRCMP_SSE42 __strcmp_sse42 |
23 | #endif |
24 | |
25 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
26 | # include "locale-defines.h" |
27 | #endif |
28 | |
29 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
30 | /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz |
31 | if the new counter > the old one (the subtraction wrapped) or is 0. */ |
32 | # define UPDATE_STRNCMP_COUNTER \ |
33 | /* calculate left number to compare: %r9 = %r11 + %rcx - 16 */ \ |
34 | lea -16(%rcx, %r11), %r9; \ |
35 | cmp %r9, %r11; /* flags from old count (%r11) - new count (%r9) */ \ |
36 | jb LABEL(strcmp_exitz); /* old < new as unsigned: the count went below zero */ \ |
37 | test %r9, %r9; \ |
38 | je LABEL(strcmp_exitz); /* nothing left to compare */ \ |
39 | mov %r9, %r11 /* commit the new remaining count; %r9 is clobbered */ |
40 | #else |
41 | # define UPDATE_STRNCMP_COUNTER /* builds without a length limit: expands to nothing */ |
42 | #endif |
43 | |
44 | #ifdef USE_AVX |
45 | # define SECTION avx /* suffix used in the .text.SECTION section name */ |
46 | # define GLABEL(l) l##_avx /* appends _avx to a global symbol name */ |
47 | #else |
48 | # define SECTION sse4.2 /* suffix used in the .text.SECTION section name */ |
49 | # define GLABEL(l) l##_sse42 /* appends _sse42 to a global symbol name */ |
50 | #endif |
51 | |
52 | #define LABEL(l) .L##l /* prefixes a label name with .L */ |
53 | |
54 | /* We use 0x1a: |
55 | _SIDD_SBYTE_OPS |
56 | | _SIDD_CMP_EQUAL_EACH |
57 | | _SIDD_NEGATIVE_POLARITY |
58 | | _SIDD_LEAST_SIGNIFICANT |
59 | on pcmpistri to find out if two 16byte data elements are the same |
60 | and the offset of the first different byte. There are 4 cases: |
61 | |
62 | 1. Both 16byte data elements are valid and identical. |
63 | 2. Both 16byte data elements have EOS and are identical. |
64 | 3. Both 16byte data elements are valid and they differ at offset X. |
65 | 4. At least one 16byte data element has EOS at offset X. Two 16byte |
66 | data elements must differ at or before offset X. |
67 | |
68 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: |
69 | |
70 | case ECX CFlag ZFlag SFlag |
71 | 1 16 0 0 0 |
72 | 2 16 0 1 1 |
73 | 3 X 1 0 0 |
74 | 4 0 <= X 1 0/1 0/1 |
75 | |
76 | We exit from the loop for cases 2, 3 and 4 with jbe which branches |
77 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for |
78 | case 2. */ |
79 | |
80 | /* Put all SSE 4.2 functions together: the code below goes into section .text.SECTION. */ |
81 | .section .text.SECTION,"ax" ,@progbits |
82 | .align 16 |
83 | .type STRCMP_SSE42, @function |
84 | .globl STRCMP_SSE42 |
85 | .hidden STRCMP_SSE42 /* global symbol with hidden visibility */ |
86 | #ifdef USE_AS_STRCASECMP_L |
87 | ENTRY (GLABEL(__strcasecmp)) |
88 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE */ |
89 | mov %fs:(%rax),%RDX_LP /* load that thread-local value into %rdx, where the code below reads the locale */ |
90 | |
91 | // XXX 5 byte should be before the function |
92 | /* 5-byte NOP: nopl 0x0(%rax,%rax,1). */ |
93 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
94 | END (GLABEL(__strcasecmp)) |
95 | /* FALLTHROUGH to strcasecmp_l: there is no ret, execution continues into the code that follows. */ |
96 | #endif |
97 | #ifdef USE_AS_STRNCASECMP_L |
98 | ENTRY (GLABEL(__strncasecmp)) |
99 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE */ |
100 | mov %fs:(%rax),%RCX_LP /* load that thread-local value into %rcx, where the code below reads the locale */ |
101 | |
102 | // XXX 5 byte should be before the function |
103 | /* 5-byte NOP: nopl 0x0(%rax,%rax,1). */ |
104 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
105 | END (GLABEL(__strncasecmp)) |
106 | /* FALLTHROUGH to strncasecmp_l: there is no ret, execution continues into the code that follows. */ |
107 | #endif |
108 | |
109 | |
110 | #ifdef USE_AVX /* AVX build: the SSE mnemonics used below are mapped to their v-prefixed forms */ |
111 | # define movdqa vmovdqa |
112 | # define movdqu vmovdqu |
113 | # define pmovmskb vpmovmskb |
114 | # define pcmpistri vpcmpistri |
115 | # define psubb vpsubb |
116 | # define pcmpeqb vpcmpeqb |
117 | # define psrldq vpsrldq |
118 | # define pslldq vpslldq |
119 | # define palignr vpalignr |
120 | # define pxor vpxor |
121 | # define D(arg) arg, arg /* destination is repeated to fill the extra operand of the three-operand form */ |
122 | #else |
123 | # define D(arg) arg /* two-operand SSE form: destination appears once */ |
124 | #endif |
125 | |
126 | STRCMP_SSE42: |
127 | cfi_startproc |
128 | _CET_ENDBR |
129 | CALL_MCOUNT |
130 | |
131 | /* |
132 | * This implementation uses SSE to compare up to 16 bytes at a time. The two strings are read through %rdi and %rsi; the length-limited variants take the limit in %rdx and keep the remaining count in %r11; the case-insensitive variants read the locale from %rdx (strcasecmp_l) or %rcx (strncasecmp_l). |
133 | */ |
134 | #ifdef USE_AS_STRCASECMP_L |
135 | /* We have to fall back on the C implementation for locales |
136 | with encodings not matching ASCII for single bytes. */ |
137 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
138 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP /* %rax = LC_CTYPE entry of the locale in %rdx */ |
139 | # else |
140 | mov (%rdx), %RAX_LP /* same entry when both offsets are zero */ |
141 | # endif |
142 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
143 | jne __strcasecmp_l_nonascii /* bit 0 of the NONASCII_CASE value set: use the fallback */ |
144 | #endif |
145 | #ifdef USE_AS_STRNCASECMP_L |
146 | /* We have to fall back on the C implementation for locales |
147 | with encodings not matching ASCII for single bytes. */ |
148 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
149 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP /* %rax = LC_CTYPE entry of the locale in %rcx */ |
150 | # else |
151 | mov (%rcx), %RAX_LP /* same entry when both offsets are zero */ |
152 | # endif |
153 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
154 | jne __strncasecmp_l_nonascii /* bit 0 of the NONASCII_CASE value set: use the fallback */ |
155 | #endif |
156 | |
157 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
158 | test %RDX_LP, %RDX_LP |
159 | je LABEL(strcmp_exitz) /* length limit is 0 */ |
160 | cmp $1, %RDX_LP |
161 | je LABEL(Byte0) /* length limit is 1 */ |
162 | mov %RDX_LP, %R11_LP /* %r11 = remaining byte count from here on */ |
163 | #endif |
164 | mov %esi, %ecx |
165 | mov %edi, %eax |
166 | /* Use 64bit AND here to avoid long NOP padding. */ |
167 | and $0x3f, %rcx /* rsi alignment in cache line */ |
168 | and $0x3f, %rax /* rdi alignment in cache line */ |
169 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
170 | .section .rodata.cst16,"aM" ,@progbits,16 |
171 | .align 16 |
172 | LABEL(belowupper): /* 0x40 in every byte: one below ASCII upper-case A (0x41) */ |
173 | .quad 0x4040404040404040 |
174 | .quad 0x4040404040404040 |
175 | LABEL(topupper): /* upper bound of the upper-case range; value depends on the compare direction used by TOLOWER */ |
176 | # ifdef USE_AVX |
177 | .quad 0x5a5a5a5a5a5a5a5a /* AVX TOLOWER tests byte > 0x5a */ |
178 | .quad 0x5a5a5a5a5a5a5a5a |
179 | # else |
180 | .quad 0x5b5b5b5b5b5b5b5b /* SSE TOLOWER tests 0x5b > byte */ |
181 | .quad 0x5b5b5b5b5b5b5b5b |
182 | # endif |
183 | LABEL(touppermask): /* 0x20 in every byte: the bit TOLOWER ORs into upper-case bytes */ |
184 | .quad 0x2020202020202020 |
185 | .quad 0x2020202020202020 |
186 | .previous |
187 | movdqa LABEL(belowupper)(%rip), %xmm4 |
188 | # define UCLOW_reg %xmm4 |
189 | movdqa LABEL(topupper)(%rip), %xmm5 |
190 | # define UCHIGH_reg %xmm5 |
191 | movdqa LABEL(touppermask)(%rip), %xmm6 |
192 | # define LCQWORD_reg %xmm6 |
193 | #endif |
194 | cmp $0x30, %ecx |
195 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ |
196 | cmp $0x30, %eax |
197 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ |
198 | movdqu (%rdi), %xmm1 |
199 | movdqu (%rsi), %xmm2 |
200 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
201 | # ifdef USE_AVX |
202 | # define TOLOWER(reg1, reg2) /* OR 0x20 into each byte of reg1 and reg2 that lies in 0x41..0x5a; clobbers %xmm7-%xmm10 */ \ |
203 | vpcmpgtb UCLOW_reg, reg1, %xmm7; /* %xmm7 = reg1 > 0x40 */ \ |
204 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; /* %xmm8 = reg1 > 0x5a */ \ |
205 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ |
206 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ |
207 | vpandn %xmm7, %xmm8, %xmm8; /* %xmm8 = %xmm7 and not %xmm8: bytes in range */ \ |
208 | vpandn %xmm9, %xmm10, %xmm10; \ |
209 | vpand LCQWORD_reg, %xmm8, %xmm8; /* 0x20 for in-range bytes, 0 elsewhere */ \ |
210 | vpand LCQWORD_reg, %xmm10, %xmm10; \ |
211 | vpor reg1, %xmm8, reg1; \ |
212 | vpor reg2, %xmm10, reg2 |
213 | # else |
214 | # define TOLOWER(reg1, reg2) /* OR 0x20 into each byte of reg1 and reg2 that lies in 0x41..0x5a; clobbers %xmm7-%xmm10 */ \ |
215 | movdqa reg1, %xmm7; \ |
216 | movdqa UCHIGH_reg, %xmm8; \ |
217 | movdqa reg2, %xmm9; \ |
218 | movdqa UCHIGH_reg, %xmm10; \ |
219 | pcmpgtb UCLOW_reg, %xmm7; /* %xmm7 = reg1 > 0x40 */ \ |
220 | pcmpgtb reg1, %xmm8; /* %xmm8 = 0x5b > reg1 */ \ |
221 | pcmpgtb UCLOW_reg, %xmm9; \ |
222 | pcmpgtb reg2, %xmm10; \ |
223 | pand %xmm8, %xmm7; /* both bounds hold: byte is in range */ \ |
224 | pand %xmm10, %xmm9; \ |
225 | pand LCQWORD_reg, %xmm7; /* 0x20 for in-range bytes, 0 elsewhere */ \ |
226 | pand LCQWORD_reg, %xmm9; \ |
227 | por %xmm7, reg1; \ |
228 | por %xmm9, reg2 |
229 | # endif |
230 | TOLOWER (%xmm1, %xmm2) |
231 | #else |
232 | # define TOLOWER(reg1, reg2) /* case-sensitive builds: expands to nothing */ |
233 | #endif |
234 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ |
235 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
236 | pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ |
237 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
238 | pmovmskb %xmm1, %edx /* bit i set only if byte i matched and was not NUL */ |
239 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ |
240 | jnz LABEL(less16bytes)/* If not, find different value or null char */ |
241 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
242 | sub $16, %r11 /* 16 bytes matched: charge them against the limit */ |
243 | jbe LABEL(strcmp_exitz)/* finish comparison */ |
244 | #endif |
245 | add $16, %rsi /* prepare to search next 16 bytes */ |
246 | add $16, %rdi /* prepare to search next 16 bytes */ |
247 | |
248 | /* |
249 | * Determine source and destination string offsets from 16-byte |
250 | * alignment. Use relative offset difference between the two to |
251 | * determine which case below to use. |
252 | */ |
253 | .p2align 4 |
254 | LABEL(crosscache): /* reached by the jumps above and by fall-through after the first 16 bytes matched */ |
255 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ |
256 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ |
257 | mov $0xffff, %edx /* for equivalent offset */ |
258 | xor %r8d, %r8d /* %r8d = 0: pointers not swapped (set to 0xffff below when they are) */ |
259 | and $0xf, %ecx /* offset of rsi */ |
260 | and $0xf, %eax /* offset of rdi */ |
261 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ |
262 | cmp %eax, %ecx |
263 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ |
264 | ja LABEL(bigger) /* %ecx already holds the larger offset */ |
265 | mov %edx, %r8d /* r8d is offset flag for exit tail */ |
266 | xchg %ecx, %eax /* swap so that %ecx/%rsi carry the larger offset */ |
267 | xchg %rsi, %rdi |
268 | LABEL(bigger): |
269 | movdqa (%rdi), %xmm2 |
270 | movdqa (%rsi), %xmm1 |
271 | lea 15(%rax), %r9 |
272 | sub %rcx, %r9 /* %r9 = 15 + %rax - %rcx: table index */ |
273 | lea LABEL(unaligned_table)(%rip), %r10 |
274 | movslq (%r10, %r9,4), %r9 /* sign-extend the 32-bit table entry */ |
275 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
276 | lea (%r10, %r9), %r10 /* entry is added to the table address to form the target */ |
277 | _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ |
278 | |
279 | /* |
280 | * The following cases will be handled by ashr_0 |
281 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
282 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 |
283 | */ |
284 | .p2align 4 |
285 | LABEL(ashr_0): /* both pointers have the same offset in their 16-byte block; %xmm0 is zero on entry (cleared at LABEL(crosscache)) */ |
286 | |
287 | movdqa (%rsi), %xmm1 |
288 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
289 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
290 | pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ |
291 | #else |
292 | movdqa (%rdi), %xmm2 |
293 | TOLOWER (%xmm1, %xmm2) |
294 | pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ |
295 | #endif |
296 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
297 | pmovmskb %xmm1, %r9d |
298 | shr %cl, %edx /* adjust 0xffff for offset */ |
299 | shr %cl, %r9d /* adjust for 16-byte offset */ |
300 | sub %r9d, %edx |
301 | /* |
302 | * edx is zero only if the bytes from the common offset (%cl) up to the end |
303 | * of the two blocks are equal and no null char was seen. |
304 | */ |
305 | jne LABEL(less32bytes) /* mismatch or null char */ |
306 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
307 | mov $16, %rcx |
308 | mov $16, %r9 |
309 | |
310 | /* |
311 | * Now both strings are aligned at 16-byte boundary. Loop over strings |
312 | * checking 32-bytes per iteration. |
313 | */ |
314 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
315 | .p2align 4 |
316 | LABEL(ashr_0_use): |
317 | movdqa (%rdi,%rdx), %xmm0 |
318 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
319 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
320 | #else |
321 | movdqa (%rsi,%rdx), %xmm1 |
322 | TOLOWER (%xmm0, %xmm1) |
323 | pcmpistri $0x1a, %xmm1, %xmm0 |
324 | #endif |
325 | lea 16(%rdx), %rdx /* advance with lea so the pcmpistri flags reach the jbe */ |
326 | jbe LABEL(ashr_0_exit_use) /* CF or ZF set: difference or end of string */ |
327 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
328 | sub $16, %r11 |
329 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
330 | #endif |
331 | |
332 | movdqa (%rdi,%rdx), %xmm0 /* second half of the unrolled iteration */ |
333 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
334 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
335 | #else |
336 | movdqa (%rsi,%rdx), %xmm1 |
337 | TOLOWER (%xmm0, %xmm1) |
338 | pcmpistri $0x1a, %xmm1, %xmm0 |
339 | #endif |
340 | lea 16(%rdx), %rdx /* advance with lea so the pcmpistri flags reach the jbe */ |
341 | jbe LABEL(ashr_0_exit_use) |
342 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
343 | sub $16, %r11 |
344 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
345 | #endif |
346 | jmp LABEL(ashr_0_use) |
347 | |
348 | |
349 | .p2align 4 |
350 | LABEL(ashr_0_exit_use): /* entered with the pcmpistri flags intact, %ecx = its index result, %rdx already advanced by 16 */ |
351 | jnc LABEL(strcmp_exitz) /* CF == 0: both blocks ended identically (case 2) */ |
352 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
353 | sub %rcx, %r11 |
354 | jbe LABEL(strcmp_exitz) /* the differing index is not below the remaining count */ |
355 | #endif |
356 | lea -16(%rdx, %rcx), %rcx /* undo the advance: %rcx = offset of the byte pcmpistri pointed at */ |
357 | movzbl (%rdi, %rcx), %eax |
358 | movzbl (%rsi, %rcx), %edx |
359 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
360 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 32-bit entries, addressed 128 entries in */ |
361 | movl (%rcx,%rax,4), %eax /* map both bytes through the table before subtracting */ |
362 | movl (%rcx,%rdx,4), %edx |
363 | #endif |
364 | sub %edx, %eax /* result = byte via %rdi minus byte via %rsi */ |
365 | ret |
366 | |
367 | |
368 | |
369 | /* |
370 | * The following cases will be handled by ashr_1 |
371 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
372 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 |
373 | */ |
374 | .p2align 4 |
375 | LABEL(ashr_1): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
376 | pslldq $15, D(%xmm2) /* shift first string to align with second */ |
377 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
378 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
379 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ |
380 | pmovmskb %xmm2, %r9d |
381 | shr %cl, %edx /* adjust 0xffff for offset */ |
382 | shr %cl, %r9d /* adjust for 16-byte offset */ |
383 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
384 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
385 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
386 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
387 | |
388 | mov $16, %rcx /* index for loads*/ |
389 | mov $1, %r9d /* byte position left over from less32bytes case */ |
390 | /* |
391 | * Set up %r10 so that we can detect crossing a page boundary. |
392 | * When %r10 goes positive we have crossed a page boundary and |
393 | * need to do a nibble. |
394 | */ |
395 | lea 1(%rdi), %r10 |
396 | and $0xfff, %r10 /* offset into 4K page */ |
397 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
398 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
399 | |
400 | .p2align 4 |
401 | LABEL(loop_ashr_1_use): /* main loop, unrolled twice (16 bytes per half) */ |
402 | add $16, %r10 |
403 | jg LABEL(nibble_ashr_1_use) /* %r10 went positive: do the page-crossing check first */ |
404 | |
405 | LABEL(nibble_ashr_1_restart_use): |
406 | movdqa (%rdi, %rdx), %xmm0 |
407 | palignr $1, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 15 bytes of the previous block followed by the first byte of this one */ |
408 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
409 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
410 | #else |
411 | movdqa (%rsi,%rdx), %xmm1 |
412 | TOLOWER (%xmm0, %xmm1) |
413 | pcmpistri $0x1a, %xmm1, %xmm0 |
414 | #endif |
415 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
416 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
417 | sub $16, %r11 |
418 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
419 | #endif |
420 | |
421 | add $16, %rdx |
422 | add $16, %r10 |
423 | jg LABEL(nibble_ashr_1_use) |
424 | |
425 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
426 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
427 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
428 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
429 | #else |
430 | movdqa (%rsi,%rdx), %xmm1 |
431 | TOLOWER (%xmm0, %xmm1) |
432 | pcmpistri $0x1a, %xmm1, %xmm0 |
433 | #endif |
434 | jbe LABEL(exit_use) |
435 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
436 | sub $16, %r11 |
437 | jbe LABEL(strcmp_exitz) |
438 | #endif |
439 | add $16, %rdx |
440 | jmp LABEL(loop_ashr_1_use) |
441 | |
442 | .p2align 4 |
443 | LABEL(nibble_ashr_1_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
444 | sub $0x1000, %r10 /* re-arm the page counter */ |
445 | movdqa -16(%rdi, %rdx), %xmm0 |
446 | psrldq $1, D(%xmm0) /* keep bytes 1..15 of the previous block; the vacated top byte is zero */ |
447 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
448 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
449 | cmp %r11, %rcx |
450 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
451 | #endif |
452 | cmp $14, %ecx |
453 | ja LABEL(nibble_ashr_1_restart_use) /* no NUL among those 15 bytes: resume the loop */ |
454 | |
455 | jmp LABEL(nibble_ashr_exit_use) |
456 | |
457 | /* |
458 | * The following cases will be handled by ashr_2 |
459 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
460 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 |
461 | */ |
462 | .p2align 4 |
463 | LABEL(ashr_2): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
464 | pslldq $14, D(%xmm2) /* shift first string to align with second */ |
465 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
466 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
467 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
468 | pmovmskb %xmm2, %r9d |
469 | shr %cl, %edx /* adjust 0xffff for offset */ |
470 | shr %cl, %r9d /* adjust for 16-byte offset */ |
471 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
472 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
473 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
474 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
475 | |
476 | mov $16, %rcx /* index for loads */ |
477 | mov $2, %r9d /* byte position left over from less32bytes case */ |
478 | /* |
479 | * Set up %r10 so that we can detect crossing a page boundary. |
480 | * When %r10 goes positive we have crossed a page boundary and |
481 | * need to do a nibble. |
482 | */ |
483 | lea 2(%rdi), %r10 |
484 | and $0xfff, %r10 /* offset into 4K page */ |
485 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
486 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
487 | |
488 | .p2align 4 |
489 | LABEL(loop_ashr_2_use): /* main loop, unrolled twice (16 bytes per half) */ |
490 | add $16, %r10 |
491 | jg LABEL(nibble_ashr_2_use) /* %r10 went positive: do the page-crossing check first */ |
492 | |
493 | LABEL(nibble_ashr_2_restart_use): |
494 | movdqa (%rdi, %rdx), %xmm0 |
495 | palignr $2, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 14 bytes of the previous block followed by the first 2 of this one */ |
496 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
497 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
498 | #else |
499 | movdqa (%rsi,%rdx), %xmm1 |
500 | TOLOWER (%xmm0, %xmm1) |
501 | pcmpistri $0x1a, %xmm1, %xmm0 |
502 | #endif |
503 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
504 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
505 | sub $16, %r11 |
506 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
507 | #endif |
508 | |
509 | add $16, %rdx |
510 | add $16, %r10 |
511 | jg LABEL(nibble_ashr_2_use) |
512 | |
513 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
514 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
515 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
516 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
517 | #else |
518 | movdqa (%rsi,%rdx), %xmm1 |
519 | TOLOWER (%xmm0, %xmm1) |
520 | pcmpistri $0x1a, %xmm1, %xmm0 |
521 | #endif |
522 | jbe LABEL(exit_use) |
523 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
524 | sub $16, %r11 |
525 | jbe LABEL(strcmp_exitz) |
526 | #endif |
527 | add $16, %rdx |
528 | jmp LABEL(loop_ashr_2_use) |
529 | |
530 | .p2align 4 |
531 | LABEL(nibble_ashr_2_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
532 | sub $0x1000, %r10 /* re-arm the page counter */ |
533 | movdqa -16(%rdi, %rdx), %xmm0 |
534 | psrldq $2, D(%xmm0) /* keep bytes 2..15 of the previous block; the vacated top bytes are zero */ |
535 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
536 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
537 | cmp %r11, %rcx |
538 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
539 | #endif |
540 | cmp $13, %ecx |
541 | ja LABEL(nibble_ashr_2_restart_use) /* no NUL among those 14 bytes: resume the loop */ |
542 | |
543 | jmp LABEL(nibble_ashr_exit_use) |
544 | |
545 | /* |
546 | * The following cases will be handled by ashr_3 |
547 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
548 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 |
549 | */ |
550 | .p2align 4 |
551 | LABEL(ashr_3): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
552 | pslldq $13, D(%xmm2) /* shift first string to align with second */ |
553 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
554 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
555 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
556 | pmovmskb %xmm2, %r9d |
557 | shr %cl, %edx /* adjust 0xffff for offset */ |
558 | shr %cl, %r9d /* adjust for 16-byte offset */ |
559 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
560 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
561 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
562 | |
563 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
564 | |
565 | mov $16, %rcx /* index for loads */ |
566 | mov $3, %r9d /* byte position left over from less32bytes case */ |
567 | /* |
568 | * Set up %r10 so that we can detect crossing a page boundary. |
569 | * When %r10 goes positive we have crossed a page boundary and |
570 | * need to do a nibble. |
571 | */ |
572 | lea 3(%rdi), %r10 |
573 | and $0xfff, %r10 /* offset into 4K page */ |
574 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
575 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
576 | .p2align 4 |
577 | LABEL(loop_ashr_3_use): /* main loop, unrolled twice (16 bytes per half); loop head 16-byte aligned like the other ashr_N loops */ |
578 | add $16, %r10 |
579 | jg LABEL(nibble_ashr_3_use) /* %r10 went positive: do the page-crossing check first */ |
580 | |
581 | LABEL(nibble_ashr_3_restart_use): |
582 | movdqa (%rdi, %rdx), %xmm0 |
583 | palignr $3, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 13 bytes of the previous block followed by the first 3 of this one */ |
584 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
585 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
586 | #else |
587 | movdqa (%rsi,%rdx), %xmm1 |
588 | TOLOWER (%xmm0, %xmm1) |
589 | pcmpistri $0x1a, %xmm1, %xmm0 |
590 | #endif |
591 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
592 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
593 | sub $16, %r11 |
594 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
595 | #endif |
596 | |
597 | add $16, %rdx |
598 | add $16, %r10 |
599 | jg LABEL(nibble_ashr_3_use) |
600 | |
601 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
602 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
603 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
604 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
605 | #else |
606 | movdqa (%rsi,%rdx), %xmm1 |
607 | TOLOWER (%xmm0, %xmm1) |
608 | pcmpistri $0x1a, %xmm1, %xmm0 |
609 | #endif |
610 | jbe LABEL(exit_use) |
611 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
612 | sub $16, %r11 |
613 | jbe LABEL(strcmp_exitz) |
614 | #endif |
615 | add $16, %rdx |
616 | jmp LABEL(loop_ashr_3_use) |
617 | |
618 | .p2align 4 |
619 | LABEL(nibble_ashr_3_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
620 | sub $0x1000, %r10 /* re-arm the page counter */ |
621 | movdqa -16(%rdi, %rdx), %xmm0 |
622 | psrldq $3, D(%xmm0) /* keep bytes 3..15 of the previous block; the vacated top bytes are zero */ |
623 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
624 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
625 | cmp %r11, %rcx |
626 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
627 | #endif |
628 | cmp $12, %ecx |
629 | ja LABEL(nibble_ashr_3_restart_use) /* no NUL among those 13 bytes: resume the loop */ |
630 | |
631 | jmp LABEL(nibble_ashr_exit_use) |
632 | |
633 | /* |
634 | * The following cases will be handled by ashr_4 |
635 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
636 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 |
637 | */ |
638 | .p2align 4 |
639 | LABEL(ashr_4): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
640 | pslldq $12, D(%xmm2) /* shift first string to align with second */ |
641 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
642 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
643 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
644 | pmovmskb %xmm2, %r9d |
645 | shr %cl, %edx /* adjust 0xffff for offset */ |
646 | shr %cl, %r9d /* adjust for 16-byte offset */ |
647 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
648 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
649 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
650 | |
651 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
652 | |
653 | mov $16, %rcx /* index for loads */ |
654 | mov $4, %r9d /* byte position left over from less32bytes case */ |
655 | /* |
656 | * Set up %r10 so that we can detect crossing a page boundary. |
657 | * When %r10 goes positive we have crossed a page boundary and |
658 | * need to do a nibble. |
659 | */ |
660 | lea 4(%rdi), %r10 |
661 | and $0xfff, %r10 /* offset into 4K page */ |
662 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
663 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
664 | |
665 | .p2align 4 |
666 | LABEL(loop_ashr_4_use): /* main loop, unrolled twice (16 bytes per half) */ |
667 | add $16, %r10 |
668 | jg LABEL(nibble_ashr_4_use) /* %r10 went positive: do the page-crossing check first */ |
669 | |
670 | LABEL(nibble_ashr_4_restart_use): |
671 | movdqa (%rdi, %rdx), %xmm0 |
672 | palignr $4, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 12 bytes of the previous block followed by the first 4 of this one */ |
673 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
674 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
675 | #else |
676 | movdqa (%rsi,%rdx), %xmm1 |
677 | TOLOWER (%xmm0, %xmm1) |
678 | pcmpistri $0x1a, %xmm1, %xmm0 |
679 | #endif |
680 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
681 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
682 | sub $16, %r11 |
683 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
684 | #endif |
685 | |
686 | add $16, %rdx |
687 | add $16, %r10 |
688 | jg LABEL(nibble_ashr_4_use) |
689 | |
690 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
691 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
692 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
693 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
694 | #else |
695 | movdqa (%rsi,%rdx), %xmm1 |
696 | TOLOWER (%xmm0, %xmm1) |
697 | pcmpistri $0x1a, %xmm1, %xmm0 |
698 | #endif |
699 | jbe LABEL(exit_use) |
700 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
701 | sub $16, %r11 |
702 | jbe LABEL(strcmp_exitz) |
703 | #endif |
704 | add $16, %rdx |
705 | jmp LABEL(loop_ashr_4_use) |
706 | |
707 | .p2align 4 |
708 | LABEL(nibble_ashr_4_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
709 | sub $0x1000, %r10 /* re-arm the page counter */ |
710 | movdqa -16(%rdi, %rdx), %xmm0 |
711 | psrldq $4, D(%xmm0) /* keep bytes 4..15 of the previous block; the vacated top bytes are zero */ |
712 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
713 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
714 | cmp %r11, %rcx |
715 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
716 | #endif |
717 | cmp $11, %ecx |
718 | ja LABEL(nibble_ashr_4_restart_use) /* no NUL among those 12 bytes: resume the loop */ |
719 | |
720 | jmp LABEL(nibble_ashr_exit_use) |
721 | |
722 | /* |
723 | * The following cases will be handled by ashr_5 |
724 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
725 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 |
726 | */ |
727 | .p2align 4 |
728 | LABEL(ashr_5): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
729 | pslldq $11, D(%xmm2) /* shift first string to align with second */ |
730 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
731 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
732 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
733 | pmovmskb %xmm2, %r9d |
734 | shr %cl, %edx /* adjust 0xffff for offset */ |
735 | shr %cl, %r9d /* adjust for 16-byte offset */ |
736 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
737 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
738 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
739 | |
740 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
741 | |
742 | mov $16, %rcx /* index for loads */ |
743 | mov $5, %r9d /* byte position left over from less32bytes case */ |
744 | /* |
745 | * Set up %r10 so that we can detect crossing a page boundary. |
746 | * When %r10 goes positive we have crossed a page boundary and |
747 | * need to do a nibble. |
748 | */ |
749 | lea 5(%rdi), %r10 |
750 | and $0xfff, %r10 /* offset into 4K page */ |
751 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
752 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
753 | |
754 | .p2align 4 |
755 | LABEL(loop_ashr_5_use): /* main loop, unrolled twice (16 bytes per half) */ |
756 | add $16, %r10 |
757 | jg LABEL(nibble_ashr_5_use) /* %r10 went positive: do the page-crossing check first */ |
758 | |
759 | LABEL(nibble_ashr_5_restart_use): |
760 | movdqa (%rdi, %rdx), %xmm0 |
761 | palignr $5, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 11 bytes of the previous block followed by the first 5 of this one */ |
762 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
763 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
764 | #else |
765 | movdqa (%rsi,%rdx), %xmm1 |
766 | TOLOWER (%xmm0, %xmm1) |
767 | pcmpistri $0x1a, %xmm1, %xmm0 |
768 | #endif |
769 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
770 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
771 | sub $16, %r11 |
772 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
773 | #endif |
774 | |
775 | add $16, %rdx |
776 | add $16, %r10 |
777 | jg LABEL(nibble_ashr_5_use) |
778 | |
779 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
780 | |
781 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
782 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
783 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
784 | #else |
785 | movdqa (%rsi,%rdx), %xmm1 |
786 | TOLOWER (%xmm0, %xmm1) |
787 | pcmpistri $0x1a, %xmm1, %xmm0 |
788 | #endif |
789 | jbe LABEL(exit_use) |
790 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
791 | sub $16, %r11 |
792 | jbe LABEL(strcmp_exitz) |
793 | #endif |
794 | add $16, %rdx |
795 | jmp LABEL(loop_ashr_5_use) |
796 | |
797 | .p2align 4 |
798 | LABEL(nibble_ashr_5_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
799 | sub $0x1000, %r10 /* re-arm the page counter */ |
800 | movdqa -16(%rdi, %rdx), %xmm0 |
801 | psrldq $5, D(%xmm0) /* keep bytes 5..15 of the previous block; the vacated top bytes are zero */ |
802 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
803 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
804 | cmp %r11, %rcx |
805 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
806 | #endif |
807 | cmp $10, %ecx |
808 | ja LABEL(nibble_ashr_5_restart_use) /* no NUL among those 11 bytes: resume the loop */ |
809 | |
810 | jmp LABEL(nibble_ashr_exit_use) |
811 | |
812 | /* |
813 | * The following cases will be handled by ashr_6 |
814 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
815 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 |
816 | */ |
817 | .p2align 4 |
818 | LABEL(ashr_6): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
819 | pslldq $10, D(%xmm2) /* shift first string to align with second */ |
820 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
821 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
822 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
823 | pmovmskb %xmm2, %r9d |
824 | shr %cl, %edx /* adjust 0xffff for offset */ |
825 | shr %cl, %r9d /* adjust for 16-byte offset */ |
826 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
827 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
828 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
829 | |
830 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
831 | |
832 | mov $16, %rcx /* index for loads */ |
833 | mov $6, %r9d /* byte position left over from less32bytes case */ |
834 | /* |
835 | * Set up %r10 so that we can detect crossing a page boundary. |
836 | * When %r10 goes positive we have crossed a page boundary and |
837 | * need to do a nibble. |
838 | */ |
839 | lea 6(%rdi), %r10 |
840 | and $0xfff, %r10 /* offset into 4K page */ |
841 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
842 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
843 | |
844 | .p2align 4 |
845 | LABEL(loop_ashr_6_use): /* main loop, unrolled twice (16 bytes per half) */ |
846 | add $16, %r10 |
847 | jg LABEL(nibble_ashr_6_use) /* %r10 went positive: do the page-crossing check first */ |
848 | |
849 | LABEL(nibble_ashr_6_restart_use): |
850 | movdqa (%rdi, %rdx), %xmm0 |
851 | palignr $6, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 10 bytes of the previous block followed by the first 6 of this one */ |
852 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
853 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
854 | #else |
855 | movdqa (%rsi,%rdx), %xmm1 |
856 | TOLOWER (%xmm0, %xmm1) |
857 | pcmpistri $0x1a, %xmm1, %xmm0 |
858 | #endif |
859 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
860 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
861 | sub $16, %r11 |
862 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
863 | #endif |
864 | |
865 | add $16, %rdx |
866 | add $16, %r10 |
867 | jg LABEL(nibble_ashr_6_use) |
868 | |
869 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
870 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
871 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
872 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
873 | #else |
874 | movdqa (%rsi,%rdx), %xmm1 |
875 | TOLOWER (%xmm0, %xmm1) |
876 | pcmpistri $0x1a, %xmm1, %xmm0 |
877 | #endif |
878 | jbe LABEL(exit_use) |
879 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
880 | sub $16, %r11 |
881 | jbe LABEL(strcmp_exitz) |
882 | #endif |
883 | add $16, %rdx |
884 | jmp LABEL(loop_ashr_6_use) |
885 | |
886 | .p2align 4 |
887 | LABEL(nibble_ashr_6_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
888 | sub $0x1000, %r10 /* re-arm the page counter */ |
889 | movdqa -16(%rdi, %rdx), %xmm0 |
890 | psrldq $6, D(%xmm0) /* keep bytes 6..15 of the previous block; the vacated top bytes are zero */ |
891 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
892 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
893 | cmp %r11, %rcx |
894 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
895 | #endif |
896 | cmp $9, %ecx |
897 | ja LABEL(nibble_ashr_6_restart_use) /* no NUL among those 10 bytes: resume the loop */ |
898 | |
899 | jmp LABEL(nibble_ashr_exit_use) |
900 | |
901 | /* |
902 | * The following cases will be handled by ashr_7 |
903 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
904 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 |
905 | */ |
906 | .p2align 4 |
907 | LABEL(ashr_7): /* assumes the state set up at LABEL(bigger): %xmm1 = block at (%rsi), %xmm2 = block at (%rdi), %xmm0 = NUL mask of %xmm1 */ |
908 | pslldq $9, D(%xmm2) /* shift first string to align with second */ |
909 | TOLOWER (%xmm1, %xmm2) /* expands to nothing unless a case-insensitive variant is built */ |
910 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
911 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
912 | pmovmskb %xmm2, %r9d |
913 | shr %cl, %edx /* adjust 0xffff for offset */ |
914 | shr %cl, %r9d /* adjust for 16-byte offset */ |
915 | sub %r9d, %edx /* zero only if all compared bytes matched with no NUL */ |
916 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
917 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read anywhere in this block - confirm it is still needed */ |
918 | |
919 | UPDATE_STRNCMP_COUNTER /* length-limited builds: charge the bytes just compared against %r11 */ |
920 | |
921 | mov $16, %rcx /* index for loads */ |
922 | mov $7, %r9d /* byte position left over from less32bytes case */ |
923 | /* |
924 | * Set up %r10 so that we can detect crossing a page boundary. |
925 | * When %r10 goes positive we have crossed a page boundary and |
926 | * need to do a nibble. |
927 | */ |
928 | lea 7(%rdi), %r10 |
929 | and $0xfff, %r10 /* offset into 4K page */ |
930 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
931 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
932 | |
933 | .p2align 4 |
934 | LABEL(loop_ashr_7_use): /* main loop, unrolled twice (16 bytes per half) */ |
935 | add $16, %r10 |
936 | jg LABEL(nibble_ashr_7_use) /* %r10 went positive: do the page-crossing check first */ |
937 | |
938 | LABEL(nibble_ashr_7_restart_use): |
939 | movdqa (%rdi, %rdx), %xmm0 |
940 | palignr $7, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = last 9 bytes of the previous block followed by the first 7 of this one */ |
941 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
942 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
943 | #else |
944 | movdqa (%rsi,%rdx), %xmm1 |
945 | TOLOWER (%xmm0, %xmm1) |
946 | pcmpistri $0x1a, %xmm1, %xmm0 |
947 | #endif |
948 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */ |
949 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
950 | sub $16, %r11 |
951 | jbe LABEL(strcmp_exitz) /* length limit exhausted */ |
952 | #endif |
953 | |
954 | add $16, %rdx |
955 | add $16, %r10 |
956 | jg LABEL(nibble_ashr_7_use) |
957 | |
958 | movdqa (%rdi, %rdx), %xmm0 /* second half of the unrolled iteration */ |
959 | palignr $7, -16(%rdi, %rdx), D(%xmm0) |
960 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
961 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
962 | #else |
963 | movdqa (%rsi,%rdx), %xmm1 |
964 | TOLOWER (%xmm0, %xmm1) |
965 | pcmpistri $0x1a, %xmm1, %xmm0 |
966 | #endif |
967 | jbe LABEL(exit_use) |
968 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
969 | sub $16, %r11 |
970 | jbe LABEL(strcmp_exitz) |
971 | #endif |
972 | add $16, %rdx |
973 | jmp LABEL(loop_ashr_7_use) |
974 | |
975 | .p2align 4 |
976 | LABEL(nibble_ashr_7_use): /* page-crossing check: look for a NUL in the bytes already loaded before reading further */ |
977 | sub $0x1000, %r10 /* re-arm the page counter */ |
978 | movdqa -16(%rdi, %rdx), %xmm0 |
979 | psrldq $7, D(%xmm0) /* keep bytes 7..15 of the previous block; the vacated top bytes are zero */ |
980 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL byte in %xmm0 */ |
981 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
982 | cmp %r11, %rcx |
983 | jae LABEL(nibble_ashr_exit_use) /* remaining count does not exceed %rcx */ |
984 | #endif |
985 | cmp $8, %ecx |
986 | ja LABEL(nibble_ashr_7_restart_use) /* no NUL among those 9 bytes: resume the loop */ |
987 | |
988 | jmp LABEL(nibble_ashr_exit_use) |
989 | |
990 | /* |
991 | * The following case is handled by ashr_8 (one of the entries of LABEL(unaligned_table)): |
992 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
993 | * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 |
994 | */ |
995 | .p2align 4 |
996 | LABEL(ashr_8): |
997 | pslldq $8, D(%xmm2) /* shift %xmm2 left by 8 bytes, zero-filling the low bytes */ |
998 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
999 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1000 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1001 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1002 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1003 | shr %cl, %r9d |
1004 | sub %r9d, %edx |
1005 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1006 | movdqa (%rdi), %xmm3 |
1007 | |
1008 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1009 | |
1010 | mov $16, %rcx /* index for loads */ |
1011 | mov $8, %r9d /* byte position left over from less32bytes case */ |
1012 | /* |
1013 | * Set up %r10 so that crossing a page boundary can be detected: |
1014 | * %r10 = ((%rdi + 8) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1015 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1016 | */ |
1017 | lea 8(%rdi), %r10 |
1018 | and $0xfff, %r10 /* offset into 4K page */ |
1019 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1020 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1021 | |
1022 | .p2align 4 |
1023 | LABEL(loop_ashr_8_use): |
1024 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1025 | jg LABEL(nibble_ashr_8_use) /* positive: take the page-boundary path first */ |
1026 | |
1027 | LABEL(nibble_ashr_8_restart_use): |
1028 | movdqa (%rdi, %rdx), %xmm0 |
1029 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 8 bytes into the previous %rdi block */ |
1030 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1031 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1032 | #else |
1033 | movdqa (%rsi,%rdx), %xmm1 |
1034 | TOLOWER (%xmm0, %xmm1) |
1035 | pcmpistri $0x1a, %xmm1, %xmm0 |
1036 | #endif |
1037 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1038 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1039 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1040 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1041 | #endif |
1042 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1043 | add $16, %rdx |
1044 | add $16, %r10 |
1045 | jg LABEL(nibble_ashr_8_use) |
1046 | |
1047 | movdqa (%rdi, %rdx), %xmm0 |
1048 | palignr $8, -16(%rdi, %rdx), D(%xmm0) |
1049 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1050 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1051 | #else |
1052 | movdqa (%rsi,%rdx), %xmm1 |
1053 | TOLOWER (%xmm0, %xmm1) |
1054 | pcmpistri $0x1a, %xmm1, %xmm0 |
1055 | #endif |
1056 | jbe LABEL(exit_use) |
1057 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1058 | sub $16, %r11 |
1059 | jbe LABEL(strcmp_exitz) |
1060 | #endif |
1061 | add $16, %rdx |
1062 | jmp LABEL(loop_ashr_8_use) |
1063 | |
1064 | .p2align 4 |
1065 | LABEL(nibble_ashr_8_use): |
1066 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1067 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1068 | psrldq $8, D(%xmm0) /* keep its last 8 bytes, zero-filled above */ |
1069 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1070 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1071 | cmp %r11, %rcx |
1072 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1073 | #endif |
1074 | cmp $7, %ecx |
1075 | ja LABEL(nibble_ashr_8_restart_use) /* %ecx > 7: resume the loop and load the next block */ |
1076 | |
1077 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1078 | |
1079 | /* |
1080 | * The following case is handled by ashr_9 (one of the entries of LABEL(unaligned_table)): |
1081 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1082 | * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 |
1083 | */ |
1084 | .p2align 4 |
1085 | LABEL(ashr_9): |
1086 | pslldq $7, D(%xmm2) /* shift %xmm2 left by 7 bytes, zero-filling the low bytes */ |
1087 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1088 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1089 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1090 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1091 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1092 | shr %cl, %r9d |
1093 | sub %r9d, %edx |
1094 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1095 | movdqa (%rdi), %xmm3 |
1096 | |
1097 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1098 | |
1099 | mov $16, %rcx /* index for loads */ |
1100 | mov $9, %r9d /* byte position left over from less32bytes case */ |
1101 | /* |
1102 | * Set up %r10 so that crossing a page boundary can be detected: |
1103 | * %r10 = ((%rdi + 9) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1104 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1105 | */ |
1106 | lea 9(%rdi), %r10 |
1107 | and $0xfff, %r10 /* offset into 4K page */ |
1108 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1109 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1110 | |
1111 | .p2align 4 |
1112 | LABEL(loop_ashr_9_use): |
1113 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1114 | jg LABEL(nibble_ashr_9_use) /* positive: take the page-boundary path first */ |
1115 | |
1116 | LABEL(nibble_ashr_9_restart_use): |
1117 | movdqa (%rdi, %rdx), %xmm0 |
1118 | |
1119 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 9 bytes into the previous %rdi block */ |
1120 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1121 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1122 | #else |
1123 | movdqa (%rsi,%rdx), %xmm1 |
1124 | TOLOWER (%xmm0, %xmm1) |
1125 | pcmpistri $0x1a, %xmm1, %xmm0 |
1126 | #endif |
1127 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1128 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1129 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1130 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1131 | #endif |
1132 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1133 | add $16, %rdx |
1134 | add $16, %r10 |
1135 | jg LABEL(nibble_ashr_9_use) |
1136 | |
1137 | movdqa (%rdi, %rdx), %xmm0 |
1138 | palignr $9, -16(%rdi, %rdx), D(%xmm0) |
1139 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1140 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1141 | #else |
1142 | movdqa (%rsi,%rdx), %xmm1 |
1143 | TOLOWER (%xmm0, %xmm1) |
1144 | pcmpistri $0x1a, %xmm1, %xmm0 |
1145 | #endif |
1146 | jbe LABEL(exit_use) |
1147 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1148 | sub $16, %r11 |
1149 | jbe LABEL(strcmp_exitz) |
1150 | #endif |
1151 | add $16, %rdx |
1152 | jmp LABEL(loop_ashr_9_use) |
1153 | |
1154 | .p2align 4 |
1155 | LABEL(nibble_ashr_9_use): |
1156 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1157 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1158 | psrldq $9, D(%xmm0) /* keep its last 7 bytes, zero-filled above */ |
1159 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1160 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1161 | cmp %r11, %rcx |
1162 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1163 | #endif |
1164 | cmp $6, %ecx |
1165 | ja LABEL(nibble_ashr_9_restart_use) /* %ecx > 6: resume the loop and load the next block */ |
1166 | |
1167 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1168 | |
1169 | /* |
1170 | * The following case is handled by ashr_10 (one of the entries of LABEL(unaligned_table)): |
1171 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1172 | * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 |
1173 | */ |
1174 | .p2align 4 |
1175 | LABEL(ashr_10): |
1176 | pslldq $6, D(%xmm2) /* shift %xmm2 left by 6 bytes, zero-filling the low bytes */ |
1177 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1178 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1179 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1180 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1181 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1182 | shr %cl, %r9d |
1183 | sub %r9d, %edx |
1184 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1185 | movdqa (%rdi), %xmm3 |
1186 | |
1187 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1188 | |
1189 | mov $16, %rcx /* index for loads */ |
1190 | mov $10, %r9d /* byte position left over from less32bytes case */ |
1191 | /* |
1192 | * Set up %r10 so that crossing a page boundary can be detected: |
1193 | * %r10 = ((%rdi + 10) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1194 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1195 | */ |
1196 | lea 10(%rdi), %r10 |
1197 | and $0xfff, %r10 /* offset into 4K page */ |
1198 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1199 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1200 | |
1201 | .p2align 4 |
1202 | LABEL(loop_ashr_10_use): |
1203 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1204 | jg LABEL(nibble_ashr_10_use) /* positive: take the page-boundary path first */ |
1205 | |
1206 | LABEL(nibble_ashr_10_restart_use): |
1207 | movdqa (%rdi, %rdx), %xmm0 |
1208 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 10 bytes into the previous %rdi block */ |
1209 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1210 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1211 | #else |
1212 | movdqa (%rsi,%rdx), %xmm1 |
1213 | TOLOWER (%xmm0, %xmm1) |
1214 | pcmpistri $0x1a, %xmm1, %xmm0 |
1215 | #endif |
1216 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1217 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1218 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1219 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1220 | #endif |
1221 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1222 | add $16, %rdx |
1223 | add $16, %r10 |
1224 | jg LABEL(nibble_ashr_10_use) |
1225 | |
1226 | movdqa (%rdi, %rdx), %xmm0 |
1227 | palignr $10, -16(%rdi, %rdx), D(%xmm0) |
1228 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1229 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1230 | #else |
1231 | movdqa (%rsi,%rdx), %xmm1 |
1232 | TOLOWER (%xmm0, %xmm1) |
1233 | pcmpistri $0x1a, %xmm1, %xmm0 |
1234 | #endif |
1235 | jbe LABEL(exit_use) |
1236 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1237 | sub $16, %r11 |
1238 | jbe LABEL(strcmp_exitz) |
1239 | #endif |
1240 | add $16, %rdx |
1241 | jmp LABEL(loop_ashr_10_use) |
1242 | |
1243 | .p2align 4 |
1244 | LABEL(nibble_ashr_10_use): |
1245 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1246 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1247 | psrldq $10, D(%xmm0) /* keep its last 6 bytes, zero-filled above */ |
1248 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1249 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1250 | cmp %r11, %rcx |
1251 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1252 | #endif |
1253 | cmp $5, %ecx |
1254 | ja LABEL(nibble_ashr_10_restart_use) /* %ecx > 5: resume the loop and load the next block */ |
1255 | |
1256 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1257 | |
1258 | /* |
1259 | * The following case is handled by ashr_11 (one of the entries of LABEL(unaligned_table)): |
1260 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1261 | * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 |
1262 | */ |
1263 | .p2align 4 |
1264 | LABEL(ashr_11): |
1265 | pslldq $5, D(%xmm2) /* shift %xmm2 left by 5 bytes, zero-filling the low bytes */ |
1266 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1267 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1268 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1269 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1270 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1271 | shr %cl, %r9d |
1272 | sub %r9d, %edx |
1273 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1274 | movdqa (%rdi), %xmm3 |
1275 | |
1276 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1277 | |
1278 | mov $16, %rcx /* index for loads */ |
1279 | mov $11, %r9d /* byte position left over from less32bytes case */ |
1280 | /* |
1281 | * Set up %r10 so that crossing a page boundary can be detected: |
1282 | * %r10 = ((%rdi + 11) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1283 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1284 | */ |
1285 | lea 11(%rdi), %r10 |
1286 | and $0xfff, %r10 /* offset into 4K page */ |
1287 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1288 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1289 | |
1290 | .p2align 4 |
1291 | LABEL(loop_ashr_11_use): |
1292 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1293 | jg LABEL(nibble_ashr_11_use) /* positive: take the page-boundary path first */ |
1294 | |
1295 | LABEL(nibble_ashr_11_restart_use): |
1296 | movdqa (%rdi, %rdx), %xmm0 |
1297 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 11 bytes into the previous %rdi block */ |
1298 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1299 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1300 | #else |
1301 | movdqa (%rsi,%rdx), %xmm1 |
1302 | TOLOWER (%xmm0, %xmm1) |
1303 | pcmpistri $0x1a, %xmm1, %xmm0 |
1304 | #endif |
1305 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1306 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1307 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1308 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1309 | #endif |
1310 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1311 | add $16, %rdx |
1312 | add $16, %r10 |
1313 | jg LABEL(nibble_ashr_11_use) |
1314 | |
1315 | movdqa (%rdi, %rdx), %xmm0 |
1316 | palignr $11, -16(%rdi, %rdx), D(%xmm0) |
1317 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1318 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1319 | #else |
1320 | movdqa (%rsi,%rdx), %xmm1 |
1321 | TOLOWER (%xmm0, %xmm1) |
1322 | pcmpistri $0x1a, %xmm1, %xmm0 |
1323 | #endif |
1324 | jbe LABEL(exit_use) |
1325 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1326 | sub $16, %r11 |
1327 | jbe LABEL(strcmp_exitz) |
1328 | #endif |
1329 | add $16, %rdx |
1330 | jmp LABEL(loop_ashr_11_use) |
1331 | |
1332 | .p2align 4 |
1333 | LABEL(nibble_ashr_11_use): |
1334 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1335 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1336 | psrldq $11, D(%xmm0) /* keep its last 5 bytes, zero-filled above */ |
1337 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1338 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1339 | cmp %r11, %rcx |
1340 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1341 | #endif |
1342 | cmp $4, %ecx |
1343 | ja LABEL(nibble_ashr_11_restart_use) /* %ecx > 4: resume the loop and load the next block */ |
1344 | |
1345 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1346 | |
1347 | /* |
1348 | * The following case is handled by ashr_12 (one of the entries of LABEL(unaligned_table)): |
1349 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1350 | * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 |
1351 | */ |
1352 | .p2align 4 |
1353 | LABEL(ashr_12): |
1354 | pslldq $4, D(%xmm2) /* shift %xmm2 left by 4 bytes, zero-filling the low bytes */ |
1355 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1356 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1357 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1358 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1359 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1360 | shr %cl, %r9d |
1361 | sub %r9d, %edx |
1362 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1363 | movdqa (%rdi), %xmm3 |
1364 | |
1365 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1366 | |
1367 | mov $16, %rcx /* index for loads */ |
1368 | mov $12, %r9d /* byte position left over from less32bytes case */ |
1369 | /* |
1370 | * Set up %r10 so that crossing a page boundary can be detected: |
1371 | * %r10 = ((%rdi + 12) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1372 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1373 | */ |
1374 | lea 12(%rdi), %r10 |
1375 | and $0xfff, %r10 /* offset into 4K page */ |
1376 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1377 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1378 | |
1379 | .p2align 4 |
1380 | LABEL(loop_ashr_12_use): |
1381 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1382 | jg LABEL(nibble_ashr_12_use) /* positive: take the page-boundary path first */ |
1383 | |
1384 | LABEL(nibble_ashr_12_restart_use): |
1385 | movdqa (%rdi, %rdx), %xmm0 |
1386 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 12 bytes into the previous %rdi block */ |
1387 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1388 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1389 | #else |
1390 | movdqa (%rsi,%rdx), %xmm1 |
1391 | TOLOWER (%xmm0, %xmm1) |
1392 | pcmpistri $0x1a, %xmm1, %xmm0 |
1393 | #endif |
1394 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1395 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1396 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1397 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1398 | #endif |
1399 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1400 | add $16, %rdx |
1401 | add $16, %r10 |
1402 | jg LABEL(nibble_ashr_12_use) |
1403 | |
1404 | movdqa (%rdi, %rdx), %xmm0 |
1405 | palignr $12, -16(%rdi, %rdx), D(%xmm0) |
1406 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1407 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1408 | #else |
1409 | movdqa (%rsi,%rdx), %xmm1 |
1410 | TOLOWER (%xmm0, %xmm1) |
1411 | pcmpistri $0x1a, %xmm1, %xmm0 |
1412 | #endif |
1413 | jbe LABEL(exit_use) |
1414 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1415 | sub $16, %r11 |
1416 | jbe LABEL(strcmp_exitz) |
1417 | #endif |
1418 | add $16, %rdx |
1419 | jmp LABEL(loop_ashr_12_use) |
1420 | |
1421 | .p2align 4 |
1422 | LABEL(nibble_ashr_12_use): |
1423 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1424 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1425 | psrldq $12, D(%xmm0) /* keep its last 4 bytes, zero-filled above */ |
1426 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1427 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1428 | cmp %r11, %rcx |
1429 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1430 | #endif |
1431 | cmp $3, %ecx |
1432 | ja LABEL(nibble_ashr_12_restart_use) /* %ecx > 3: resume the loop and load the next block */ |
1433 | |
1434 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1435 | |
1436 | /* |
1437 | * The following case is handled by ashr_13 (one of the entries of LABEL(unaligned_table)): |
1438 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1439 | * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 |
1440 | */ |
1441 | .p2align 4 |
1442 | LABEL(ashr_13): |
1443 | pslldq $3, D(%xmm2) /* shift %xmm2 left by 3 bytes, zero-filling the low bytes */ |
1444 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1445 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1446 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1447 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1448 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1449 | shr %cl, %r9d |
1450 | sub %r9d, %edx |
1451 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1452 | movdqa (%rdi), %xmm3 |
1453 | |
1454 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1455 | |
1456 | mov $16, %rcx /* index for loads */ |
1457 | mov $13, %r9d /* byte position left over from less32bytes case */ |
1458 | /* |
1459 | * Set up %r10 so that crossing a page boundary can be detected: |
1460 | * %r10 = ((%rdi + 13) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1461 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1462 | */ |
1463 | lea 13(%rdi), %r10 |
1464 | and $0xfff, %r10 /* offset into 4K page */ |
1465 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1466 | |
1467 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1468 | |
1469 | .p2align 4 |
1470 | LABEL(loop_ashr_13_use): |
1471 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1472 | jg LABEL(nibble_ashr_13_use) /* positive: take the page-boundary path first */ |
1473 | |
1474 | LABEL(nibble_ashr_13_restart_use): |
1475 | movdqa (%rdi, %rdx), %xmm0 |
1476 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 13 bytes into the previous %rdi block */ |
1477 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1478 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1479 | #else |
1480 | movdqa (%rsi,%rdx), %xmm1 |
1481 | TOLOWER (%xmm0, %xmm1) |
1482 | pcmpistri $0x1a, %xmm1, %xmm0 |
1483 | #endif |
1484 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1485 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1486 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1487 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1488 | #endif |
1489 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1490 | add $16, %rdx |
1491 | add $16, %r10 |
1492 | jg LABEL(nibble_ashr_13_use) |
1493 | |
1494 | movdqa (%rdi, %rdx), %xmm0 |
1495 | palignr $13, -16(%rdi, %rdx), D(%xmm0) |
1496 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1497 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1498 | #else |
1499 | movdqa (%rsi,%rdx), %xmm1 |
1500 | TOLOWER (%xmm0, %xmm1) |
1501 | pcmpistri $0x1a, %xmm1, %xmm0 |
1502 | #endif |
1503 | jbe LABEL(exit_use) |
1504 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1505 | sub $16, %r11 |
1506 | jbe LABEL(strcmp_exitz) |
1507 | #endif |
1508 | add $16, %rdx |
1509 | jmp LABEL(loop_ashr_13_use) |
1510 | |
1511 | .p2align 4 |
1512 | LABEL(nibble_ashr_13_use): |
1513 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1514 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1515 | psrldq $13, D(%xmm0) /* keep its last 3 bytes, zero-filled above */ |
1516 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1517 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1518 | cmp %r11, %rcx |
1519 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1520 | #endif |
1521 | cmp $2, %ecx |
1522 | ja LABEL(nibble_ashr_13_restart_use) /* %ecx > 2: resume the loop and load the next block */ |
1523 | |
1524 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1525 | |
1526 | /* |
1527 | * The following case is handled by ashr_14 (one of the entries of LABEL(unaligned_table)): |
1528 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1529 | * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 |
1530 | */ |
1531 | .p2align 4 |
1532 | LABEL(ashr_14): |
1533 | pslldq $2, D(%xmm2) /* shift %xmm2 left by 2 bytes, zero-filling the low bytes */ |
1534 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1535 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1536 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1537 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1538 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1539 | shr %cl, %r9d |
1540 | sub %r9d, %edx |
1541 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1542 | movdqa (%rdi), %xmm3 |
1543 | |
1544 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1545 | |
1546 | mov $16, %rcx /* index for loads */ |
1547 | mov $14, %r9d /* byte position left over from less32bytes case */ |
1548 | /* |
1549 | * Set up %r10 so that crossing a page boundary can be detected: |
1550 | * %r10 = ((%rdi + 14) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1551 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1552 | */ |
1553 | lea 14(%rdi), %r10 |
1554 | and $0xfff, %r10 /* offset into 4K page */ |
1555 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1556 | |
1557 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1558 | |
1559 | .p2align 4 |
1560 | LABEL(loop_ashr_14_use): |
1561 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1562 | jg LABEL(nibble_ashr_14_use) /* positive: take the page-boundary path first */ |
1563 | |
1564 | LABEL(nibble_ashr_14_restart_use): |
1565 | movdqa (%rdi, %rdx), %xmm0 |
1566 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 14 bytes into the previous %rdi block */ |
1567 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1568 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1569 | #else |
1570 | movdqa (%rsi,%rdx), %xmm1 |
1571 | TOLOWER (%xmm0, %xmm1) |
1572 | pcmpistri $0x1a, %xmm1, %xmm0 |
1573 | #endif |
1574 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1575 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1576 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1577 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1578 | #endif |
1579 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1580 | add $16, %rdx |
1581 | add $16, %r10 |
1582 | jg LABEL(nibble_ashr_14_use) |
1583 | |
1584 | movdqa (%rdi, %rdx), %xmm0 |
1585 | palignr $14, -16(%rdi, %rdx), D(%xmm0) |
1586 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1587 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1588 | #else |
1589 | movdqa (%rsi,%rdx), %xmm1 |
1590 | TOLOWER (%xmm0, %xmm1) |
1591 | pcmpistri $0x1a, %xmm1, %xmm0 |
1592 | #endif |
1593 | jbe LABEL(exit_use) |
1594 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1595 | sub $16, %r11 |
1596 | jbe LABEL(strcmp_exitz) |
1597 | #endif |
1598 | add $16, %rdx |
1599 | jmp LABEL(loop_ashr_14_use) |
1600 | |
1601 | .p2align 4 |
1602 | LABEL(nibble_ashr_14_use): |
1603 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1604 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1605 | psrldq $14, D(%xmm0) /* keep its last 2 bytes, zero-filled above */ |
1606 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1607 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1608 | cmp %r11, %rcx |
1609 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1610 | #endif |
1611 | cmp $1, %ecx |
1612 | ja LABEL(nibble_ashr_14_restart_use) /* %ecx > 1: resume the loop and load the next block */ |
1613 | |
1614 | jmp LABEL(nibble_ashr_exit_use) /* otherwise finish using only the leftover bytes */ |
1615 | |
1616 | /* |
1617 | * The following case is handled by ashr_15 (one of the entries of LABEL(unaligned_table)): |
1618 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1619 | * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 |
1620 | */ |
1621 | .p2align 4 |
1622 | LABEL(ashr_15): |
1623 | pslldq $1, D(%xmm2) /* shift %xmm2 left by 1 byte, zero-filling the low byte */ |
1624 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably case folding for the case-insensitive builds */ |
1625 | pcmpeqb %xmm1, D(%xmm2) /* byte-wise equality mask of %xmm1 and the shifted %xmm2 */ |
1626 | psubb %xmm0, D(%xmm2) /* subtract %xmm0, set up before this entry point (presumably the NUL mask -- confirm) */ |
1627 | pmovmskb %xmm2, %r9d /* %r9d = one bit per byte of %xmm2 */ |
1628 | shr %cl, %edx /* shift both bit masks right by %cl */ |
1629 | shr %cl, %r9d |
1630 | sub %r9d, %edx |
1631 | jnz LABEL(less32bytes) /* nonzero difference: finish in LABEL(less32bytes) */ |
1632 | |
1633 | movdqa (%rdi), %xmm3 |
1634 | |
1635 | UPDATE_STRNCMP_COUNTER /* length-limited builds only: %r11 += %rcx - 16, return 0 if that wraps or reaches 0 */ |
1636 | |
1637 | mov $16, %rcx /* index for loads */ |
1638 | mov $15, %r9d /* byte position left over from less32bytes case */ |
1639 | /* |
1640 | * Set up %r10 so that crossing a page boundary can be detected: |
1641 | * %r10 = ((%rdi + 15) & 0xfff) - 0x1000, and the loop adds 16 per block. |
1642 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1643 | */ |
1644 | lea 15(%rdi), %r10 |
1645 | and $0xfff, %r10 /* offset into 4K page */ |
1646 | |
1647 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1648 | |
1649 | mov %rcx, %rdx /* %rdx = 16: block offset used by the pcmpistri loop below */ |
1650 | |
1651 | .p2align 4 |
1652 | LABEL(loop_ashr_15_use): |
1653 | add $16, %r10 /* advance the page-crossing counter by one block */ |
1654 | jg LABEL(nibble_ashr_15_use) /* positive: take the page-boundary path first */ |
1655 | |
1656 | LABEL(nibble_ashr_15_restart_use): |
1657 | movdqa (%rdi, %rdx), %xmm0 |
1658 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 15 bytes into the previous %rdi block */ |
1659 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1660 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1661 | #else |
1662 | movdqa (%rsi,%rdx), %xmm1 |
1663 | TOLOWER (%xmm0, %xmm1) |
1664 | pcmpistri $0x1a, %xmm1, %xmm0 |
1665 | #endif |
1666 | jbe LABEL(exit_use) /* CF (bytes differ) or ZF (NUL in the %rsi-side block) is set */ |
1667 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1668 | sub $16, %r11 /* 16 more bytes consumed from the length limit */ |
1669 | jbe LABEL(strcmp_exitz) /* limit reached without a difference: return 0 */ |
1670 | #endif |
1671 | /* Second copy of the loop body (the loop is unrolled by two). */ |
1672 | add $16, %rdx |
1673 | add $16, %r10 |
1674 | jg LABEL(nibble_ashr_15_use) |
1675 | |
1676 | movdqa (%rdi, %rdx), %xmm0 |
1677 | palignr $15, -16(%rdi, %rdx), D(%xmm0) |
1678 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1679 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1680 | #else |
1681 | movdqa (%rsi,%rdx), %xmm1 |
1682 | TOLOWER (%xmm0, %xmm1) |
1683 | pcmpistri $0x1a, %xmm1, %xmm0 |
1684 | #endif |
1685 | jbe LABEL(exit_use) |
1686 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1687 | sub $16, %r11 |
1688 | jbe LABEL(strcmp_exitz) |
1689 | #endif |
1690 | add $16, %rdx |
1691 | jmp LABEL(loop_ashr_15_use) |
1692 | |
1693 | .p2align 4 |
1694 | LABEL(nibble_ashr_15_use): |
1695 | sub $0x1000, %r10 /* re-arm the page counter for the next 4K page */ |
1696 | movdqa -16(%rdi, %rdx), %xmm0 /* the %rdi block just before offset %rdx */ |
1697 | psrldq $15, D(%xmm0) /* keep its last byte, zero-filled above */ |
1698 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index within those bytes (presumably of the first NUL -- confirm against the Intel SDM) */ |
1699 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1700 | cmp %r11, %rcx |
1701 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= remaining length limit: no need to read further */ |
1702 | #endif |
1703 | cmp $0, %ecx |
1704 | ja LABEL(nibble_ashr_15_restart_use) /* %ecx > 0: resume the loop and load the next block */ |
1705 | /* otherwise fall through into LABEL(nibble_ashr_exit_use) */ |
1706 | LABEL(nibble_ashr_exit_use): /* shared tail of the nibble_ashr_N_use paths: compare the leftover bytes in %xmm0 with the %rsi block at offset %rdx */ |
1707 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1708 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
1709 | #else |
1710 | movdqa (%rsi,%rdx), %xmm1 |
1711 | TOLOWER (%xmm0, %xmm1) |
1712 | pcmpistri $0x1a, %xmm1, %xmm0 |
1713 | #endif |
1714 | .p2align 4 |
1715 | LABEL(exit_use): /* in: flags and %ecx from pcmpistri, %rdx = block offset, %r9 = byte shift of the current ashr_N case, %r8d = swap flag */ |
1716 | jnc LABEL(strcmp_exitz) /* CF clear: no differing byte was found, return 0 */ |
1717 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1718 | sub %rcx, %r11 /* is the differing byte at or beyond the length limit? */ |
1719 | jbe LABEL(strcmp_exitz) /* yes: return 0 */ |
1720 | #endif |
1721 | add %rcx, %rdx /* %rdx = block offset + index reported by pcmpistri */ |
1722 | lea -16(%rdi, %r9), %rdi /* account for the palignr shift on the %rdi side */ |
1723 | movzbl (%rdi, %rdx), %eax |
1724 | movzbl (%rsi, %rdx), %edx |
1725 | test %r8d, %r8d |
1726 | jz LABEL(ret_use) |
1727 | xchg %eax, %edx /* %r8d != 0: swap the two bytes before subtracting */ |
1728 | LABEL(ret_use): |
1729 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1730 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* 4-byte-entry table; the +128*4 bias presumably allows negative indices -- confirm */ |
1731 | movl (%rcx,%rdx,4), %edx /* replace both bytes by their table entries */ |
1732 | movl (%rcx,%rax,4), %eax |
1733 | #endif |
1734 | |
1735 | sub %edx, %eax /* return value = difference of the two bytes */ |
1736 | ret |
1737 | |
1738 | LABEL(less32bytes): /* in: %rax/%rcx = offsets to add to %rdi/%rsi, %rdx = bit mask scanned with bsf below, %r8d = swap flag */ |
1739 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ |
1740 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ |
1741 | test %r8d, %r8d |
1742 | jz LABEL(ret) |
1743 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ |
1744 | |
1745 | .p2align 4 |
1746 | LABEL(ret): |
1747 | LABEL(less16bytes): |
1748 | bsf %rdx, %rdx /* find and store bit index in %rdx */ |
1749 | |
1750 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1751 | sub %rdx, %r11 /* index at or beyond the remaining length limit: return 0 */ |
1752 | jbe LABEL(strcmp_exitz) |
1753 | #endif |
1754 | movzbl (%rsi, %rdx), %ecx /* load the byte at that index from each string */ |
1755 | movzbl (%rdi, %rdx), %eax |
1756 | |
1757 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1758 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* 4-byte-entry table; the +128*4 bias presumably allows negative indices -- confirm */ |
1759 | movl (%rdx,%rcx,4), %ecx /* replace both bytes by their table entries */ |
1760 | movl (%rdx,%rax,4), %eax |
1761 | #endif |
1762 | |
1763 | sub %ecx, %eax /* return value = (%rdi byte) - (%rsi byte) */ |
1764 | ret |
1765 | |
1766 | LABEL(strcmp_exitz): /* common exit for "no difference found": return 0 */ |
1767 | xor %eax, %eax /* %eax = 0 */ |
1768 | ret |
1769 | |
1770 | .p2align 4 |
1771 | // XXX Same byte-compare tail as LABEL(less16bytes) above, but for the bytes at (%rsi) and (%rdi) with no index and no length check |
1772 | LABEL(Byte0): |
1773 | movzbl (%rsi), %ecx |
1774 | movzbl (%rdi), %eax |
1775 | |
1776 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1777 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* 4-byte-entry table; the +128*4 bias presumably allows negative indices -- confirm */ |
1778 | movl (%rdx,%rcx,4), %ecx /* replace both bytes by their table entries */ |
1779 | movl (%rdx,%rax,4), %eax |
1780 | #endif |
1781 | |
1782 | sub %ecx, %eax /* return value = (%rdi byte) - (%rsi byte) */ |
1783 | ret |
1784 | cfi_endproc |
1785 | .size STRCMP_SSE42, .-STRCMP_SSE42 |
1786 | |
1787 | #undef UCLOW_reg |
1788 | #undef UCHIGH_reg |
1789 | #undef LCQWORD_reg |
1790 | #undef TOLOWER |
1791 | |
1792 | /* Put all SSE 4.2 functions together. The table below goes into the read-only data section named after SECTION (defined at the top of the file). */ |
1793 | .section .rodata.SECTION,"a" ,@progbits |
1794 | .p2align 3 |
1795 | LABEL(unaligned_table): /* 16 32-bit entries, each the offset of an ashr_N entry point relative to this table; ashr_1 first, ashr_0 last */ |
1796 | .int LABEL(ashr_1) - LABEL(unaligned_table) |
1797 | .int LABEL(ashr_2) - LABEL(unaligned_table) |
1798 | .int LABEL(ashr_3) - LABEL(unaligned_table) |
1799 | .int LABEL(ashr_4) - LABEL(unaligned_table) |
1800 | .int LABEL(ashr_5) - LABEL(unaligned_table) |
1801 | .int LABEL(ashr_6) - LABEL(unaligned_table) |
1802 | .int LABEL(ashr_7) - LABEL(unaligned_table) |
1803 | .int LABEL(ashr_8) - LABEL(unaligned_table) |
1804 | .int LABEL(ashr_9) - LABEL(unaligned_table) |
1805 | .int LABEL(ashr_10) - LABEL(unaligned_table) |
1806 | .int LABEL(ashr_11) - LABEL(unaligned_table) |
1807 | .int LABEL(ashr_12) - LABEL(unaligned_table) |
1808 | .int LABEL(ashr_13) - LABEL(unaligned_table) |
1809 | .int LABEL(ashr_14) - LABEL(unaligned_table) |
1810 | .int LABEL(ashr_15) - LABEL(unaligned_table) |
1811 | .int LABEL(ashr_0) - LABEL(unaligned_table) |
1812 | |
1813 | #undef LABEL |
1814 | #undef GLABEL |
1815 | #undef SECTION |
1816 | #undef movdqa |
1817 | #undef movdqu |
1818 | #undef pmovmskb |
1819 | #undef pcmpistri |
1820 | #undef psubb |
1821 | #undef pcmpeqb |
1822 | #undef psrldq |
1823 | #undef pslldq |
1824 | #undef palignr |
1825 | #undef pxor |
1826 | #undef D |
1827 | |