1 | /* strcmp with SSE4.2 |
2 | Copyright (C) 2009-2021 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | |
22 | #ifndef STRCMP_SSE42 |
23 | # define STRCMP_SSE42 __strcmp_sse42 /* default symbol name, used unless STRCMP_SSE42 was already defined */ |
24 | #endif |
25 | |
26 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
27 | # include "locale-defines.h" |
28 | #endif |
29 | |
30 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
31 | /* Deduct the (16 - %rcx) bytes of the first 16-byte block from the counter: %r9 = %r11 + %rcx - 16. Since the counter, %r11, is unsigned, we branch to strcmp_exitz |
32 | if the new counter > the old one (it wrapped around) or is 0; otherwise %r11 takes the new value. Clobbers %r9 and the flags. */ |
33 | # define UPDATE_STRNCMP_COUNTER \ |
34 | /* calculate left number to compare */ \ |
35 | lea -16(%rcx, %r11), %r9; \ |
36 | cmp %r9, %r11; \ |
37 | jb LABEL(strcmp_exitz); \ |
38 | test %r9, %r9; \ |
39 | je LABEL(strcmp_exitz); \ |
40 | mov %r9, %r11 |
41 | #else |
42 | # define UPDATE_STRNCMP_COUNTER /* no length limit in this variant: expands to nothing */ |
43 | #endif |
44 | |
45 | #ifdef USE_AVX |
46 | # define SECTION avx /* used in the .text.SECTION section name below */ |
47 | # define GLABEL(l) l##_avx /* entry symbol name with the ISA suffix appended */ |
48 | #else |
49 | # define SECTION sse4.2 |
50 | # define GLABEL(l) l##_sse42 |
51 | #endif |
52 | |
53 | #define LABEL(l) .L##l /* every local label in this file is spelled with a .L prefix */ |
54 | |
55 | /* We use 0x1a: |
56 | _SIDD_SBYTE_OPS |
57 | | _SIDD_CMP_EQUAL_EACH |
58 | | _SIDD_NEGATIVE_POLARITY |
59 | | _SIDD_LEAST_SIGNIFICANT |
60 | on pcmpistri to find out if two 16byte data elements are the same |
61 | and the offset of the first different byte. There are 4 cases: |
62 | |
63 | 1. Both 16byte data elements are valid and identical. |
64 | 2. Both 16byte data elements have EOS and are identical. |
65 | 3. Both 16byte data elements are valid and they differ at offset X. |
66 | 4. At least one 16byte data element has EOS at offset X. Two 16byte |
67 | data elements must differ at or before offset X. |
68 | |
69 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: |
70 | |
71 | case ECX CFlag ZFlag SFlag |
72 | 1 16 0 0 0 |
73 | 2 16 0 1 1 |
74 | 3 X 1 0 0 |
75 | 4 0 <= X 1 0/1 0/1 |
76 | |
77 | We exit from the loop for cases 2, 3 and 4 with jbe which branches |
78 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for |
79 | case 2. */ |
80 | |
81 | /* Put all SSE 4.2 functions together: the code goes into the .text.SECTION section, 16-byte aligned, and STRCMP_SSE42 is declared as a hidden global function symbol. */ |
82 | .section .text.SECTION,"ax" ,@progbits |
83 | .align 16 |
84 | .type STRCMP_SSE42, @function |
85 | .globl STRCMP_SSE42 |
86 | .hidden STRCMP_SSE42 |
87 | #ifdef USE_AS_STRCASECMP_L |
88 | ENTRY (GLABEL(__strcasecmp)) |
89 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE, read from the GOT */ |
90 | mov %fs:(%rax),%RDX_LP /* load the thread's locale pointer into %rdx, which the locale check in STRCMP_SSE42 dereferences */ |
91 | |
92 | // XXX 5 byte should be before the function |
93 | /* 5-byte NOP. */ |
94 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
95 | END (GLABEL(__strcasecmp)) |
96 | /* FALLTHROUGH to strcasecmp_l. */ |
97 | #endif |
98 | #ifdef USE_AS_STRNCASECMP_L |
99 | ENTRY (GLABEL(__strncasecmp)) |
100 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE, read from the GOT */ |
101 | mov %fs:(%rax),%RCX_LP /* load the thread's locale pointer into %rcx, which the locale check in STRCMP_SSE42 dereferences */ |
102 | |
103 | // XXX 5 byte should be before the function |
104 | /* 5-byte NOP. */ |
105 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
106 | END (GLABEL(__strncasecmp)) |
107 | /* FALLTHROUGH to strncasecmp_l. */ |
108 | #endif |
109 | |
110 | |
111 | #ifdef USE_AVX |
112 | # define movdqa vmovdqa /* USE_AVX: each SSE mnemonic listed here is replaced by its v-prefixed form */ |
113 | # define movdqu vmovdqu |
114 | # define pmovmskb vpmovmskb |
115 | # define pcmpistri vpcmpistri |
116 | # define psubb vpsubb |
117 | # define pcmpeqb vpcmpeqb |
118 | # define psrldq vpsrldq |
119 | # define pslldq vpslldq |
120 | # define palignr vpalignr |
121 | # define pxor vpxor |
122 | # define D(arg) arg, arg /* the v-forms take one more operand: repeat the destination as the extra source */ |
123 | #else |
124 | # define D(arg) arg /* SSE forms: the destination is written once */ |
125 | #endif |
126 | |
127 | STRCMP_SSE42: /* entry point; the two string pointers are read from %rdi and %rsi below */ |
128 | cfi_startproc |
129 | _CET_ENDBR |
130 | CALL_MCOUNT |
131 | |
132 | /* |
133 | * This implementation uses SSE to compare up to 16 bytes at a time. |
134 | */ |
135 | #ifdef USE_AS_STRCASECMP_L |
136 | /* We have to fall back on the C implementation for locales |
137 | with encodings not matching ASCII for single bytes. */ |
138 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
139 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP /* %rax = pointer stored at that offset in the locale object in %rdx */ |
140 | # else |
141 | mov (%rdx), %RAX_LP |
142 | # endif |
143 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the _NL_CTYPE_NONASCII_CASE value */ |
144 | jne __strcasecmp_l_nonascii /* bit set: take the fallback described above */ |
145 | #endif |
146 | #ifdef USE_AS_STRNCASECMP_L |
147 | /* We have to fall back on the C implementation for locales |
148 | with encodings not matching ASCII for single bytes. */ |
149 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
150 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP /* %rax = pointer stored at that offset in the locale object in %rcx */ |
151 | # else |
152 | mov (%rcx), %RAX_LP |
153 | # endif |
154 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the _NL_CTYPE_NONASCII_CASE value */ |
155 | jne __strncasecmp_l_nonascii /* bit set: take the fallback described above */ |
156 | #endif |
157 | |
158 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
159 | test %RDX_LP, %RDX_LP /* length limit of 0? */ |
160 | je LABEL(strcmp_exitz) |
161 | cmp $1, %RDX_LP /* length limit of exactly 1 goes to Byte0 */ |
162 | je LABEL(Byte0) |
163 | mov %RDX_LP, %R11_LP /* %r11 = remaining-length counter used from here on */ |
164 | #endif |
165 | mov %esi, %ecx |
166 | mov %edi, %eax |
167 | /* Use 64bit AND here to avoid long NOP padding. */ |
168 | and $0x3f, %rcx /* rsi alignment in cache line */ |
169 | and $0x3f, %rax /* rdi alignment in cache line */ |
170 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
171 | .section .rodata.cst16,"aM" ,@progbits,16 |
172 | .align 16 |
173 | LABEL(belowupper): |
174 | .quad 0x4040404040404040 /* 0x40 in every byte: one below 'A' */ |
175 | .quad 0x4040404040404040 |
176 | LABEL(topupper): |
177 | # ifdef USE_AVX |
178 | .quad 0x5a5a5a5a5a5a5a5a /* 'Z' in every byte; the AVX TOLOWER tests reg > 'Z' and inverts it */ |
179 | .quad 0x5a5a5a5a5a5a5a5a |
180 | # else |
181 | .quad 0x5b5b5b5b5b5b5b5b /* 'Z' + 1 in every byte; the SSE TOLOWER tests 'Z' + 1 > reg */ |
182 | .quad 0x5b5b5b5b5b5b5b5b |
183 | # endif |
184 | LABEL(touppermask): |
185 | .quad 0x2020202020202020 /* 0x20 in every byte: the bit TOLOWER ORs into bytes in 'A'..'Z' */ |
186 | .quad 0x2020202020202020 |
187 | .previous |
188 | movdqa LABEL(belowupper)(%rip), %xmm4 |
189 | # define UCLOW_reg %xmm4 |
190 | movdqa LABEL(topupper)(%rip), %xmm5 |
191 | # define UCHIGH_reg %xmm5 |
192 | movdqa LABEL(touppermask)(%rip), %xmm6 |
193 | # define LCQWORD_reg %xmm6 |
194 | #endif |
195 | cmp $0x30, %ecx |
196 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ |
197 | cmp $0x30, %eax |
198 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ |
199 | movdqu (%rdi), %xmm1 |
200 | movdqu (%rsi), %xmm2 |
201 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
202 | # ifdef USE_AVX |
203 | # define TOLOWER(reg1, reg2) \ |
204 | vpcmpgtb UCLOW_reg, reg1, %xmm7; \ |
205 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ |
206 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ |
207 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ |
208 | vpandn %xmm7, %xmm8, %xmm8; \ |
209 | vpandn %xmm9, %xmm10, %xmm10; \ |
210 | vpand LCQWORD_reg, %xmm8, %xmm8; \ |
211 | vpand LCQWORD_reg, %xmm10, %xmm10; \ |
212 | vpor reg1, %xmm8, reg1; \ |
213 | vpor reg2, %xmm10, reg2 |
214 | # else |
215 | # define TOLOWER(reg1, reg2) \ |
216 | movdqa reg1, %xmm7; \ |
217 | movdqa UCHIGH_reg, %xmm8; \ |
218 | movdqa reg2, %xmm9; \ |
219 | movdqa UCHIGH_reg, %xmm10; \ |
220 | pcmpgtb UCLOW_reg, %xmm7; \ |
221 | pcmpgtb reg1, %xmm8; \ |
222 | pcmpgtb UCLOW_reg, %xmm9; \ |
223 | pcmpgtb reg2, %xmm10; \ |
224 | pand %xmm8, %xmm7; \ |
225 | pand %xmm10, %xmm9; \ |
226 | pand LCQWORD_reg, %xmm7; \ |
227 | pand LCQWORD_reg, %xmm9; \ |
228 | por %xmm7, reg1; \ |
229 | por %xmm9, reg2 |
230 | # endif |
231 | TOLOWER (%xmm1, %xmm2) /* set bit 0x20 in every byte of both registers that lies in 'A'..'Z'; uses %xmm7-%xmm10 as scratch */ |
232 | #else |
233 | # define TOLOWER(reg1, reg2) |
234 | #endif |
235 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ |
236 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
237 | pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ |
238 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
239 | pmovmskb %xmm1, %edx /* bit i of %edx is set iff byte i is equal in both strings and not null */ |
240 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ |
241 | jnz LABEL(less16bytes)/* If not, find different value or null char */ |
242 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
243 | sub $16, %r11 |
244 | jbe LABEL(strcmp_exitz)/* finish comparison */ |
245 | #endif |
246 | add $16, %rsi /* prepare to search next 16 bytes */ |
247 | add $16, %rdi /* prepare to search next 16 bytes */ |
248 | |
249 | /* |
250 | * Determine source and destination string offsets from 16-byte |
251 | * alignment. Use relative offset difference between the two to |
252 | * determine which case below to use. |
253 | */ |
254 | .p2align 4 |
255 | LABEL(crosscache): |
256 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ |
257 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ |
258 | mov $0xffff, %edx /* for equivalent offset */ |
259 | xor %r8d, %r8d /* r8d = 0: pointers not swapped (set to 0xffff below when they are) */ |
260 | and $0xf, %ecx /* offset of rsi */ |
261 | and $0xf, %eax /* offset of rdi */ |
262 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ |
263 | cmp %eax, %ecx |
264 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ |
265 | ja LABEL(bigger) |
266 | mov %edx, %r8d /* r8d is offset flag for exit tail */ |
267 | xchg %ecx, %eax /* swap so that %ecx holds the larger offset ... */ |
268 | xchg %rsi, %rdi /* ... and %rsi the pointer that goes with it */ |
269 | LABEL(bigger): |
270 | movdqa (%rdi), %xmm2 |
271 | movdqa (%rsi), %xmm1 |
272 | lea 15(%rax), %r9 /* r9 = 15 + rax - rcx: table index, 0..14 since rcx > rax here */ |
273 | sub %rcx, %r9 |
274 | lea LABEL(unaligned_table)(%rip), %r10 |
275 | movslq (%r10, %r9,4), %r9 /* 32-bit table entry, sign-extended; added to the table address below */ |
276 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
277 | lea (%r10, %r9), %r10 |
278 | _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ |
279 | |
280 | /* |
281 | * The following cases will be handled by ashr_0 |
282 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
283 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 |
284 | */ |
285 | .p2align 4 |
286 | LABEL(ashr_0): |
287 | |
288 | movdqa (%rsi), %xmm1 |
289 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
290 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
291 | pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ |
292 | #else |
293 | movdqa (%rdi), %xmm2 |
294 | TOLOWER (%xmm1, %xmm2) |
295 | pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ |
296 | #endif |
297 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ |
298 | pmovmskb %xmm1, %r9d |
299 | shr %cl, %edx /* adjust 0xffff for offset */ |
300 | shr %cl, %r9d /* adjust for 16-byte offset */ |
301 | sub %r9d, %edx |
302 | /* |
303 | * edx is now 0 only if the (16-rcx) bytes from the shared offset to the end of |
304 | * this 16-byte block are equal in both strings and no null char was seen. |
305 | */ |
306 | jne LABEL(less32bytes) /* mismatch or null char */ |
307 | UPDATE_STRNCMP_COUNTER |
308 | mov $16, %rcx |
309 | mov $16, %r9 |
310 | |
311 | /* |
312 | * Now both strings are aligned at 16-byte boundary. Loop over strings |
313 | * checking 32-bytes per iteration. |
314 | */ |
315 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
316 | .p2align 4 |
317 | LABEL(ashr_0_use): |
318 | movdqa (%rdi,%rdx), %xmm0 |
319 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
320 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
321 | #else |
322 | movdqa (%rsi,%rdx), %xmm1 |
323 | TOLOWER (%xmm0, %xmm1) |
324 | pcmpistri $0x1a, %xmm1, %xmm0 |
325 | #endif |
326 | lea 16(%rdx), %rdx /* advance the offset; lea leaves the pcmpistri flags intact for the jbe */ |
327 | jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
328 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
329 | sub $16, %r11 |
330 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
331 | #endif |
332 | |
333 | movdqa (%rdi,%rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
334 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
335 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
336 | #else |
337 | movdqa (%rsi,%rdx), %xmm1 |
338 | TOLOWER (%xmm0, %xmm1) |
339 | pcmpistri $0x1a, %xmm1, %xmm0 |
340 | #endif |
341 | lea 16(%rdx), %rdx |
342 | jbe LABEL(ashr_0_exit_use) |
343 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
344 | sub $16, %r11 |
345 | jbe LABEL(strcmp_exitz) |
346 | #endif |
347 | jmp LABEL(ashr_0_use) |
348 | |
349 | |
350 | .p2align 4 |
351 | LABEL(ashr_0_exit_use): |
352 | jnc LABEL(strcmp_exitz) /* CFlag == 0: both blocks identical up to EOS (case 2) */ |
353 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
354 | sub %rcx, %r11 /* %rcx = byte index reported by pcmpistri */ |
355 | jbe LABEL(strcmp_exitz) /* that index is at or past the length limit */ |
356 | #endif |
357 | lea -16(%rdx, %rcx), %rcx /* offset of the differing byte; %rdx was already advanced by 16 in the loop */ |
358 | movzbl (%rdi, %rcx), %eax |
359 | movzbl (%rsi, %rcx), %edx |
360 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
361 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 4-byte entries, indexed by the byte values below */ |
362 | movl (%rcx,%rax,4), %eax |
363 | movl (%rcx,%rdx,4), %edx |
364 | #endif |
365 | sub %edx, %eax /* result in %eax: difference of the two bytes */ |
366 | ret |
367 | |
368 | |
369 | |
370 | /* |
371 | * The following cases will be handled by ashr_1 |
372 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
373 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 |
374 | */ |
375 | .p2align 4 |
376 | LABEL(ashr_1): |
377 | pslldq $15, D(%xmm2) /* shift first string to align with second */ |
378 | TOLOWER (%xmm1, %xmm2) |
379 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
380 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ |
381 | pmovmskb %xmm2, %r9d |
382 | shr %cl, %edx /* adjust 0xffff for offset */ |
383 | shr %cl, %r9d /* adjust for 16-byte offset */ |
384 | sub %r9d, %edx |
385 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
386 | movdqa (%rdi), %xmm3 |
387 | UPDATE_STRNCMP_COUNTER |
388 | |
389 | mov $16, %rcx /* index for loads*/ |
390 | mov $1, %r9d /* byte position left over from less32bytes case */ |
391 | /* |
392 | * Setup %r10 value allows us to detect crossing a page boundary. |
393 | * When %r10 goes positive we have crossed a page boundary and |
394 | * need to do a nibble. |
395 | */ |
396 | lea 1(%rdi), %r10 |
397 | and $0xfff, %r10 /* offset into 4K page */ |
398 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
399 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
400 | |
401 | .p2align 4 |
402 | LABEL(loop_ashr_1_use): |
403 | add $16, %r10 |
404 | jg LABEL(nibble_ashr_1_use) /* %r10 went positive: check for the string end before loading the next block */ |
405 | |
406 | LABEL(nibble_ashr_1_restart_use): |
407 | movdqa (%rdi, %rdx), %xmm0 |
408 | palignr $1, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 1 byte into the previous block */ |
409 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
410 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
411 | #else |
412 | movdqa (%rsi,%rdx), %xmm1 |
413 | TOLOWER (%xmm0, %xmm1) |
414 | pcmpistri $0x1a, %xmm1, %xmm0 |
415 | #endif |
416 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
417 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
418 | sub $16, %r11 |
419 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
420 | #endif |
421 | |
422 | add $16, %rdx |
423 | add $16, %r10 |
424 | jg LABEL(nibble_ashr_1_use) |
425 | |
426 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
427 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
428 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
429 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
430 | #else |
431 | movdqa (%rsi,%rdx), %xmm1 |
432 | TOLOWER (%xmm0, %xmm1) |
433 | pcmpistri $0x1a, %xmm1, %xmm0 |
434 | #endif |
435 | jbe LABEL(exit_use) |
436 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
437 | sub $16, %r11 |
438 | jbe LABEL(strcmp_exitz) |
439 | #endif |
440 | add $16, %rdx |
441 | jmp LABEL(loop_ashr_1_use) |
442 | |
443 | .p2align 4 |
444 | LABEL(nibble_ashr_1_use): |
445 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
446 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
447 | psrldq $1, D(%xmm0) /* drop the 1 byte already compared; the top is zero-filled */ |
448 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
449 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
450 | cmp %r11, %rcx |
451 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
452 | #endif |
453 | cmp $14, %ecx |
454 | ja LABEL(nibble_ashr_1_restart_use) /* no null among the 15 leftover bytes: the string continues past the boundary */ |
455 | |
456 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
457 | |
458 | /* |
459 | * The following cases will be handled by ashr_2 |
460 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
461 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 |
462 | */ |
463 | .p2align 4 |
464 | LABEL(ashr_2): |
465 | pslldq $14, D(%xmm2) /* shift first string to align with second */ |
466 | TOLOWER (%xmm1, %xmm2) |
467 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
468 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
469 | pmovmskb %xmm2, %r9d |
470 | shr %cl, %edx /* adjust 0xffff for offset */ |
471 | shr %cl, %r9d /* adjust for 16-byte offset */ |
472 | sub %r9d, %edx |
473 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
474 | movdqa (%rdi), %xmm3 |
475 | UPDATE_STRNCMP_COUNTER |
476 | |
477 | mov $16, %rcx /* index for loads */ |
478 | mov $2, %r9d /* byte position left over from less32bytes case */ |
479 | /* |
480 | * Setup %r10 value allows us to detect crossing a page boundary. |
481 | * When %r10 goes positive we have crossed a page boundary and |
482 | * need to do a nibble. |
483 | */ |
484 | lea 2(%rdi), %r10 |
485 | and $0xfff, %r10 /* offset into 4K page */ |
486 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
487 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
488 | |
489 | .p2align 4 |
490 | LABEL(loop_ashr_2_use): |
491 | add $16, %r10 |
492 | jg LABEL(nibble_ashr_2_use) /* %r10 went positive: check for the string end before loading the next block */ |
493 | |
494 | LABEL(nibble_ashr_2_restart_use): |
495 | movdqa (%rdi, %rdx), %xmm0 |
496 | palignr $2, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 2 bytes into the previous block */ |
497 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
498 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
499 | #else |
500 | movdqa (%rsi,%rdx), %xmm1 |
501 | TOLOWER (%xmm0, %xmm1) |
502 | pcmpistri $0x1a, %xmm1, %xmm0 |
503 | #endif |
504 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
505 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
506 | sub $16, %r11 |
507 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
508 | #endif |
509 | |
510 | add $16, %rdx |
511 | add $16, %r10 |
512 | jg LABEL(nibble_ashr_2_use) |
513 | |
514 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
515 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
516 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
517 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
518 | #else |
519 | movdqa (%rsi,%rdx), %xmm1 |
520 | TOLOWER (%xmm0, %xmm1) |
521 | pcmpistri $0x1a, %xmm1, %xmm0 |
522 | #endif |
523 | jbe LABEL(exit_use) |
524 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
525 | sub $16, %r11 |
526 | jbe LABEL(strcmp_exitz) |
527 | #endif |
528 | add $16, %rdx |
529 | jmp LABEL(loop_ashr_2_use) |
530 | |
531 | .p2align 4 |
532 | LABEL(nibble_ashr_2_use): |
533 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
534 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
535 | psrldq $2, D(%xmm0) /* drop the 2 bytes already compared; the top is zero-filled */ |
536 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
537 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
538 | cmp %r11, %rcx |
539 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
540 | #endif |
541 | cmp $13, %ecx |
542 | ja LABEL(nibble_ashr_2_restart_use) /* no null among the 14 leftover bytes: the string continues past the boundary */ |
543 | |
544 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
545 | |
546 | /* |
547 | * The following cases will be handled by ashr_3 |
548 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
549 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 |
550 | */ |
551 | .p2align 4 |
552 | LABEL(ashr_3): |
553 | pslldq $13, D(%xmm2) /* shift first string to align with second */ |
554 | TOLOWER (%xmm1, %xmm2) |
555 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
556 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
557 | pmovmskb %xmm2, %r9d |
558 | shr %cl, %edx /* adjust 0xffff for offset */ |
559 | shr %cl, %r9d /* adjust for 16-byte offset */ |
560 | sub %r9d, %edx |
561 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
562 | movdqa (%rdi), %xmm3 |
563 | |
564 | UPDATE_STRNCMP_COUNTER |
565 | |
566 | mov $16, %rcx /* index for loads */ |
567 | mov $3, %r9d /* byte position left over from less32bytes case */ |
568 | /* |
569 | * Setup %r10 value allows us to detect crossing a page boundary. |
570 | * When %r10 goes positive we have crossed a page boundary and |
571 | * need to do a nibble. |
572 | */ |
573 | lea 3(%rdi), %r10 |
574 | and $0xfff, %r10 /* offset into 4K page */ |
575 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
576 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
577 | .p2align 4 |
578 | LABEL(loop_ashr_3_use): /* loop head, 16-byte aligned like the other loop_ashr_N_use labels */ |
579 | add $16, %r10 |
580 | jg LABEL(nibble_ashr_3_use) /* %r10 went positive: check for the string end before loading the next block */ |
581 | |
582 | LABEL(nibble_ashr_3_restart_use): |
583 | movdqa (%rdi, %rdx), %xmm0 |
584 | palignr $3, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 3 bytes into the previous block */ |
585 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
586 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
587 | #else |
588 | movdqa (%rsi,%rdx), %xmm1 |
589 | TOLOWER (%xmm0, %xmm1) |
590 | pcmpistri $0x1a, %xmm1, %xmm0 |
591 | #endif |
592 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
593 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
594 | sub $16, %r11 |
595 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
596 | #endif |
597 | |
598 | add $16, %rdx |
599 | add $16, %r10 |
600 | jg LABEL(nibble_ashr_3_use) |
601 | |
602 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
603 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
604 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
605 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
606 | #else |
607 | movdqa (%rsi,%rdx), %xmm1 |
608 | TOLOWER (%xmm0, %xmm1) |
609 | pcmpistri $0x1a, %xmm1, %xmm0 |
610 | #endif |
611 | jbe LABEL(exit_use) |
612 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
613 | sub $16, %r11 |
614 | jbe LABEL(strcmp_exitz) |
615 | #endif |
616 | add $16, %rdx |
617 | jmp LABEL(loop_ashr_3_use) |
618 | |
619 | .p2align 4 |
620 | LABEL(nibble_ashr_3_use): |
621 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
622 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
623 | psrldq $3, D(%xmm0) /* drop the 3 bytes already compared; the top is zero-filled */ |
624 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
625 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
626 | cmp %r11, %rcx |
627 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
628 | #endif |
629 | cmp $12, %ecx |
630 | ja LABEL(nibble_ashr_3_restart_use) /* no null among the 13 leftover bytes: the string continues past the boundary */ |
631 | |
632 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
633 | |
634 | /* |
635 | * The following cases will be handled by ashr_4 |
636 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
637 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 |
638 | */ |
639 | .p2align 4 |
640 | LABEL(ashr_4): |
641 | pslldq $12, D(%xmm2) /* shift first string to align with second */ |
642 | TOLOWER (%xmm1, %xmm2) |
643 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
644 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
645 | pmovmskb %xmm2, %r9d |
646 | shr %cl, %edx /* adjust 0xffff for offset */ |
647 | shr %cl, %r9d /* adjust for 16-byte offset */ |
648 | sub %r9d, %edx |
649 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
650 | movdqa (%rdi), %xmm3 |
651 | |
652 | UPDATE_STRNCMP_COUNTER |
653 | |
654 | mov $16, %rcx /* index for loads */ |
655 | mov $4, %r9d /* byte position left over from less32bytes case */ |
656 | /* |
657 | * Setup %r10 value allows us to detect crossing a page boundary. |
658 | * When %r10 goes positive we have crossed a page boundary and |
659 | * need to do a nibble. |
660 | */ |
661 | lea 4(%rdi), %r10 |
662 | and $0xfff, %r10 /* offset into 4K page */ |
663 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
664 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
665 | |
666 | .p2align 4 |
667 | LABEL(loop_ashr_4_use): |
668 | add $16, %r10 |
669 | jg LABEL(nibble_ashr_4_use) /* %r10 went positive: check for the string end before loading the next block */ |
670 | |
671 | LABEL(nibble_ashr_4_restart_use): |
672 | movdqa (%rdi, %rdx), %xmm0 |
673 | palignr $4, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 4 bytes into the previous block */ |
674 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
675 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
676 | #else |
677 | movdqa (%rsi,%rdx), %xmm1 |
678 | TOLOWER (%xmm0, %xmm1) |
679 | pcmpistri $0x1a, %xmm1, %xmm0 |
680 | #endif |
681 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
682 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
683 | sub $16, %r11 |
684 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
685 | #endif |
686 | |
687 | add $16, %rdx |
688 | add $16, %r10 |
689 | jg LABEL(nibble_ashr_4_use) |
690 | |
691 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
692 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
693 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
694 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
695 | #else |
696 | movdqa (%rsi,%rdx), %xmm1 |
697 | TOLOWER (%xmm0, %xmm1) |
698 | pcmpistri $0x1a, %xmm1, %xmm0 |
699 | #endif |
700 | jbe LABEL(exit_use) |
701 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
702 | sub $16, %r11 |
703 | jbe LABEL(strcmp_exitz) |
704 | #endif |
705 | add $16, %rdx |
706 | jmp LABEL(loop_ashr_4_use) |
707 | |
708 | .p2align 4 |
709 | LABEL(nibble_ashr_4_use): |
710 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
711 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
712 | psrldq $4, D(%xmm0) /* drop the 4 bytes already compared; the top is zero-filled */ |
713 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
714 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
715 | cmp %r11, %rcx |
716 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
717 | #endif |
718 | cmp $11, %ecx |
719 | ja LABEL(nibble_ashr_4_restart_use) /* no null among the 12 leftover bytes: the string continues past the boundary */ |
720 | |
721 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
722 | |
723 | /* |
724 | * The following cases will be handled by ashr_5 |
725 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
726 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 |
727 | */ |
728 | .p2align 4 |
729 | LABEL(ashr_5): |
730 | pslldq $11, D(%xmm2) /* shift first string to align with second */ |
731 | TOLOWER (%xmm1, %xmm2) |
732 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
733 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
734 | pmovmskb %xmm2, %r9d |
735 | shr %cl, %edx /* adjust 0xffff for offset */ |
736 | shr %cl, %r9d /* adjust for 16-byte offset */ |
737 | sub %r9d, %edx |
738 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
739 | movdqa (%rdi), %xmm3 |
740 | |
741 | UPDATE_STRNCMP_COUNTER |
742 | |
743 | mov $16, %rcx /* index for loads */ |
744 | mov $5, %r9d /* byte position left over from less32bytes case */ |
745 | /* |
746 | * Setup %r10 value allows us to detect crossing a page boundary. |
747 | * When %r10 goes positive we have crossed a page boundary and |
748 | * need to do a nibble. |
749 | */ |
750 | lea 5(%rdi), %r10 |
751 | and $0xfff, %r10 /* offset into 4K page */ |
752 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
753 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
754 | |
755 | .p2align 4 |
756 | LABEL(loop_ashr_5_use): |
757 | add $16, %r10 |
758 | jg LABEL(nibble_ashr_5_use) /* %r10 went positive: check for the string end before loading the next block */ |
759 | |
760 | LABEL(nibble_ashr_5_restart_use): |
761 | movdqa (%rdi, %rdx), %xmm0 |
762 | palignr $5, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 5 bytes into the previous block */ |
763 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
764 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
765 | #else |
766 | movdqa (%rsi,%rdx), %xmm1 |
767 | TOLOWER (%xmm0, %xmm1) |
768 | pcmpistri $0x1a, %xmm1, %xmm0 |
769 | #endif |
770 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
771 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
772 | sub $16, %r11 |
773 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
774 | #endif |
775 | |
776 | add $16, %rdx |
777 | add $16, %r10 |
778 | jg LABEL(nibble_ashr_5_use) |
779 | |
780 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
781 | |
782 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
783 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
784 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
785 | #else |
786 | movdqa (%rsi,%rdx), %xmm1 |
787 | TOLOWER (%xmm0, %xmm1) |
788 | pcmpistri $0x1a, %xmm1, %xmm0 |
789 | #endif |
790 | jbe LABEL(exit_use) |
791 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
792 | sub $16, %r11 |
793 | jbe LABEL(strcmp_exitz) |
794 | #endif |
795 | add $16, %rdx |
796 | jmp LABEL(loop_ashr_5_use) |
797 | |
798 | .p2align 4 |
799 | LABEL(nibble_ashr_5_use): |
800 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
801 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
802 | psrldq $5, D(%xmm0) /* drop the 5 bytes already compared; the top is zero-filled */ |
803 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
804 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
805 | cmp %r11, %rcx |
806 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
807 | #endif |
808 | cmp $10, %ecx |
809 | ja LABEL(nibble_ashr_5_restart_use) /* no null among the 11 leftover bytes: the string continues past the boundary */ |
810 | |
811 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
812 | |
813 | /* |
814 | * The following cases will be handled by ashr_6 |
815 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
816 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 |
817 | */ |
818 | .p2align 4 |
819 | LABEL(ashr_6): |
820 | pslldq $10, D(%xmm2) /* shift first string to align with second */ |
821 | TOLOWER (%xmm1, %xmm2) |
822 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
823 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
824 | pmovmskb %xmm2, %r9d |
825 | shr %cl, %edx /* adjust 0xffff for offset */ |
826 | shr %cl, %r9d /* adjust for 16-byte offset */ |
827 | sub %r9d, %edx |
828 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
829 | movdqa (%rdi), %xmm3 |
830 | |
831 | UPDATE_STRNCMP_COUNTER |
832 | |
833 | mov $16, %rcx /* index for loads */ |
834 | mov $6, %r9d /* byte position left over from less32bytes case */ |
835 | /* |
836 | * Setup %r10 value allows us to detect crossing a page boundary. |
837 | * When %r10 goes positive we have crossed a page boundary and |
838 | * need to do a nibble. |
839 | */ |
840 | lea 6(%rdi), %r10 |
841 | and $0xfff, %r10 /* offset into 4K page */ |
842 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
843 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
844 | |
845 | .p2align 4 |
846 | LABEL(loop_ashr_6_use): |
847 | add $16, %r10 |
848 | jg LABEL(nibble_ashr_6_use) /* %r10 went positive: check for the string end before loading the next block */ |
849 | |
850 | LABEL(nibble_ashr_6_restart_use): |
851 | movdqa (%rdi, %rdx), %xmm0 |
852 | palignr $6, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = 16 bytes starting 6 bytes into the previous block */ |
853 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
854 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
855 | #else |
856 | movdqa (%rsi,%rdx), %xmm1 |
857 | TOLOWER (%xmm0, %xmm1) |
858 | pcmpistri $0x1a, %xmm1, %xmm0 |
859 | #endif |
860 | jbe LABEL(exit_use) /* CFlag or ZFlag set: difference or EOS in this block */ |
861 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
862 | sub $16, %r11 |
863 | jbe LABEL(strcmp_exitz) /* length limit used up */ |
864 | #endif |
865 | |
866 | add $16, %rdx |
867 | add $16, %r10 |
868 | jg LABEL(nibble_ashr_6_use) |
869 | |
870 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the step above (the loop is unrolled twice) */ |
871 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
872 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
873 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
874 | #else |
875 | movdqa (%rsi,%rdx), %xmm1 |
876 | TOLOWER (%xmm0, %xmm1) |
877 | pcmpistri $0x1a, %xmm1, %xmm0 |
878 | #endif |
879 | jbe LABEL(exit_use) |
880 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
881 | sub $16, %r11 |
882 | jbe LABEL(strcmp_exitz) |
883 | #endif |
884 | add $16, %rdx |
885 | jmp LABEL(loop_ashr_6_use) |
886 | |
887 | .p2align 4 |
888 | LABEL(nibble_ashr_6_use): |
889 | sub $0x1000, %r10 /* re-arm %r10 for the next page boundary */ |
890 | movdqa -16(%rdi, %rdx), %xmm0 /* previous 16-byte block of the shifted string */ |
891 | psrldq $6, D(%xmm0) /* drop the 6 bytes already compared; the top is zero-filled */ |
892 | pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: %ecx = index of the first null byte */ |
893 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
894 | cmp %r11, %rcx |
895 | jae LABEL(nibble_ashr_exit_use) /* the length limit is reached within these bytes */ |
896 | #endif |
897 | cmp $9, %ecx |
898 | ja LABEL(nibble_ashr_6_restart_use) /* no null among the 10 leftover bytes: the string continues past the boundary */ |
899 | |
900 | jmp LABEL(nibble_ashr_exit_use) /* a null byte was found among the leftover bytes */ |
901 | |
902 | /* |
903 | * The following cases will be handled by ashr_7 |
904 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
905 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 |
906 | */ |
907 | .p2align 4 |
908 | LABEL(ashr_7): |
909 | pslldq $9, D(%xmm2) |
910 | TOLOWER (%xmm1, %xmm2) |
911 | pcmpeqb %xmm1, D(%xmm2) |
912 | psubb %xmm0, D(%xmm2) |
913 | pmovmskb %xmm2, %r9d |
914 | shr %cl, %edx |
915 | shr %cl, %r9d |
916 | sub %r9d, %edx |
917 | jnz LABEL(less32bytes) |
918 | movdqa (%rdi), %xmm3 |
919 | |
920 | UPDATE_STRNCMP_COUNTER |
921 | |
922 | mov $16, %rcx /* index for loads */ |
923 | mov $7, %r9d /* byte position left over from less32bytes case */ |
924 | /* |
925 | * Setup %r10 value allows us to detect crossing a page boundary. |
926 | * When %r10 goes positive we have crossed a page boundary and |
927 | * need to do a nibble. |
928 | */ |
929 | lea 7(%rdi), %r10 |
930 | and $0xfff, %r10 /* offset into 4K page */ |
931 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
932 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
933 | /* loop_ashr_7_use: compare loop for ashr_7, unrolled twice; each step realigns 16 bytes of the %rdi data by 7 and compares them with the block at (%rsi,%rdx). */ |
934 | .p2align 4 |
935 | LABEL(loop_ashr_7_use): |
936 | add $16, %r10 /* account for the next 16-byte block */ |
937 | jg LABEL(nibble_ashr_7_use) /* %r10 positive: page boundary, do the nibble check first */ |
938 | |
939 | LABEL(nibble_ashr_7_restart_use): /* nibble_ashr_7_use jumps back here when the loop may continue */ |
940 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
941 | palignr $7, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 7..22 of the 32 bytes starting at -16(%rdi,%rdx) */ |
942 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
943 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
944 | #else |
945 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
946 | TOLOWER (%xmm0, %xmm1) |
947 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
948 | #endif |
949 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
950 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
951 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
952 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
953 | #endif |
954 | |
955 | add $16, %rdx /* advance to the next block */ |
956 | add $16, %r10 |
957 | jg LABEL(nibble_ashr_7_use) |
958 | |
959 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
960 | palignr $7, -16(%rdi, %rdx), D(%xmm0) |
961 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
962 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
963 | #else |
964 | movdqa (%rsi,%rdx), %xmm1 |
965 | TOLOWER (%xmm0, %xmm1) |
966 | pcmpistri $0x1a, %xmm1, %xmm0 |
967 | #endif |
968 | jbe LABEL(exit_use) |
969 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
970 | sub $16, %r11 |
971 | jbe LABEL(strcmp_exitz) |
972 | #endif |
973 | add $16, %rdx /* advance to the next block */ |
974 | jmp LABEL(loop_ashr_7_use) |
975 | /* nibble_ashr_7_use: page-boundary check for ashr_7; looks for the end of the string in the 9 bytes of the previous %rdi block that the next compare step would use. */ |
976 | .p2align 4 |
977 | LABEL(nibble_ashr_7_use): |
978 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
979 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
980 | psrldq $7, D(%xmm0) /* move its upper 9 bytes to the low lanes; the top 7 bytes become zero */ |
981 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
982 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
983 | cmp %r11, %rcx |
984 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
985 | #endif |
986 | cmp $8, %ecx |
987 | ja LABEL(nibble_ashr_7_restart_use) /* index > 8: none of the 9 data bytes is null, resume the loop */ |
988 | |
989 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
990 | |
991 | /* |
992 | * The following cases will be handled by ashr_8: |
993 | * rcx (offset of rsi) = n, with n in 8~15; rax (offset of rdi) = n - 8; |
994 | * relative offset = 7, i.e. 15 + (n - 8) - n; corresponding case = ashr_8. |
995 | */ |
996 | .p2align 4 /* align the entry label to 16 bytes */ |
997 | LABEL(ashr_8): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
998 | pslldq $8, D(%xmm2) /* shift %xmm2 left by 8 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
999 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1000 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1001 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1002 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1003 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1004 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1005 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1006 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1007 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1008 | |
1009 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1010 | |
1011 | mov $16, %rcx /* index for loads */ |
1012 | mov $8, %r9d /* byte position left over from less32bytes case */ |
1013 | /* |
1014 | * Set up %r10 so that crossing a page boundary can be detected. |
1015 | * When %r10 goes positive we have crossed a page boundary and |
1016 | * need to do a nibble. |
1017 | */ |
1018 | lea 8(%rdi), %r10 /* %r10 = %rdi + 8 */ |
1019 | and $0xfff, %r10 /* offset into 4K page */ |
1020 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1021 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1022 | /* loop_ashr_8_use: compare loop for ashr_8, unrolled twice; each step realigns 16 bytes of the %rdi data by 8 and compares them with the block at (%rsi,%rdx). */ |
1023 | .p2align 4 |
1024 | LABEL(loop_ashr_8_use): |
1025 | add $16, %r10 /* account for the next 16-byte block */ |
1026 | jg LABEL(nibble_ashr_8_use) /* %r10 positive: page boundary, do the nibble check first */ |
1027 | |
1028 | LABEL(nibble_ashr_8_restart_use): /* nibble_ashr_8_use jumps back here when the loop may continue */ |
1029 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1030 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 8..23 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1031 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1032 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1033 | #else |
1034 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1035 | TOLOWER (%xmm0, %xmm1) |
1036 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1037 | #endif |
1038 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1039 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1040 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1041 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1042 | #endif |
1043 | |
1044 | add $16, %rdx /* advance to the next block */ |
1045 | add $16, %r10 |
1046 | jg LABEL(nibble_ashr_8_use) |
1047 | |
1048 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1049 | palignr $8, -16(%rdi, %rdx), D(%xmm0) |
1050 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1051 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1052 | #else |
1053 | movdqa (%rsi,%rdx), %xmm1 |
1054 | TOLOWER (%xmm0, %xmm1) |
1055 | pcmpistri $0x1a, %xmm1, %xmm0 |
1056 | #endif |
1057 | jbe LABEL(exit_use) |
1058 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1059 | sub $16, %r11 |
1060 | jbe LABEL(strcmp_exitz) |
1061 | #endif |
1062 | add $16, %rdx /* advance to the next block */ |
1063 | jmp LABEL(loop_ashr_8_use) |
1064 | /* nibble_ashr_8_use: page-boundary check for ashr_8; looks for the end of the string in the 8 bytes of the previous %rdi block that the next compare step would use. */ |
1065 | .p2align 4 |
1066 | LABEL(nibble_ashr_8_use): |
1067 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1068 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1069 | psrldq $8, D(%xmm0) /* move its upper 8 bytes to the low lanes; the top 8 bytes become zero */ |
1070 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1071 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1072 | cmp %r11, %rcx |
1073 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1074 | #endif |
1075 | cmp $7, %ecx |
1076 | ja LABEL(nibble_ashr_8_restart_use) /* index > 7: none of the 8 data bytes is null, resume the loop */ |
1077 | |
1078 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1079 | |
1080 | /* |
1081 | * The following cases will be handled by ashr_9: |
1082 | * rcx (offset of rsi) = n, with n in 7~15; rax (offset of rdi) = n - 7; |
1083 | * relative offset = 8, i.e. 15 + (n - 7) - n; corresponding case = ashr_9. |
1084 | */ |
1085 | .p2align 4 /* align the entry label to 16 bytes */ |
1086 | LABEL(ashr_9): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1087 | pslldq $7, D(%xmm2) /* shift %xmm2 left by 7 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1088 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1089 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1090 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1091 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1092 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1093 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1094 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1095 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1096 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1097 | |
1098 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1099 | |
1100 | mov $16, %rcx /* index for loads */ |
1101 | mov $9, %r9d /* byte position left over from less32bytes case */ |
1102 | /* |
1103 | * Set up %r10 so that crossing a page boundary can be detected. |
1104 | * When %r10 goes positive we have crossed a page boundary and |
1105 | * need to do a nibble. |
1106 | */ |
1107 | lea 9(%rdi), %r10 /* %r10 = %rdi + 9 */ |
1108 | and $0xfff, %r10 /* offset into 4K page */ |
1109 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1110 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1111 | /* loop_ashr_9_use: compare loop for ashr_9, unrolled twice; each step realigns 16 bytes of the %rdi data by 9 and compares them with the block at (%rsi,%rdx). */ |
1112 | .p2align 4 |
1113 | LABEL(loop_ashr_9_use): |
1114 | add $16, %r10 /* account for the next 16-byte block */ |
1115 | jg LABEL(nibble_ashr_9_use) /* %r10 positive: page boundary, do the nibble check first */ |
1116 | |
1117 | LABEL(nibble_ashr_9_restart_use): /* nibble_ashr_9_use jumps back here when the loop may continue */ |
1118 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1119 | |
1120 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 9..24 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1121 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1122 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1123 | #else |
1124 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1125 | TOLOWER (%xmm0, %xmm1) |
1126 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1127 | #endif |
1128 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1129 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1130 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1131 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1132 | #endif |
1133 | |
1134 | add $16, %rdx /* advance to the next block */ |
1135 | add $16, %r10 |
1136 | jg LABEL(nibble_ashr_9_use) |
1137 | |
1138 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1139 | palignr $9, -16(%rdi, %rdx), D(%xmm0) |
1140 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1141 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1142 | #else |
1143 | movdqa (%rsi,%rdx), %xmm1 |
1144 | TOLOWER (%xmm0, %xmm1) |
1145 | pcmpistri $0x1a, %xmm1, %xmm0 |
1146 | #endif |
1147 | jbe LABEL(exit_use) |
1148 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1149 | sub $16, %r11 |
1150 | jbe LABEL(strcmp_exitz) |
1151 | #endif |
1152 | add $16, %rdx /* advance to the next block */ |
1153 | jmp LABEL(loop_ashr_9_use) |
1154 | /* nibble_ashr_9_use: page-boundary check for ashr_9; looks for the end of the string in the 7 bytes of the previous %rdi block that the next compare step would use. */ |
1155 | .p2align 4 |
1156 | LABEL(nibble_ashr_9_use): |
1157 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1158 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1159 | psrldq $9, D(%xmm0) /* move its upper 7 bytes to the low lanes; the top 9 bytes become zero */ |
1160 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1161 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1162 | cmp %r11, %rcx |
1163 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1164 | #endif |
1165 | cmp $6, %ecx |
1166 | ja LABEL(nibble_ashr_9_restart_use) /* index > 6: none of the 7 data bytes is null, resume the loop */ |
1167 | |
1168 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1169 | |
1170 | /* |
1171 | * The following cases will be handled by ashr_10: |
1172 | * rcx (offset of rsi) = n, with n in 6~15; rax (offset of rdi) = n - 6; |
1173 | * relative offset = 9, i.e. 15 + (n - 6) - n; corresponding case = ashr_10. |
1174 | */ |
1175 | .p2align 4 /* align the entry label to 16 bytes */ |
1176 | LABEL(ashr_10): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1177 | pslldq $6, D(%xmm2) /* shift %xmm2 left by 6 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1178 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1179 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1180 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1181 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1182 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1183 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1184 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1185 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1186 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1187 | |
1188 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1189 | |
1190 | mov $16, %rcx /* index for loads */ |
1191 | mov $10, %r9d /* byte position left over from less32bytes case */ |
1192 | /* |
1193 | * Set up %r10 so that crossing a page boundary can be detected. |
1194 | * When %r10 goes positive we have crossed a page boundary and |
1195 | * need to do a nibble. |
1196 | */ |
1197 | lea 10(%rdi), %r10 /* %r10 = %rdi + 10 */ |
1198 | and $0xfff, %r10 /* offset into 4K page */ |
1199 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1200 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1201 | /* loop_ashr_10_use: compare loop for ashr_10, unrolled twice; each step realigns 16 bytes of the %rdi data by 10 and compares them with the block at (%rsi,%rdx). */ |
1202 | .p2align 4 |
1203 | LABEL(loop_ashr_10_use): |
1204 | add $16, %r10 /* account for the next 16-byte block */ |
1205 | jg LABEL(nibble_ashr_10_use) /* %r10 positive: page boundary, do the nibble check first */ |
1206 | |
1207 | LABEL(nibble_ashr_10_restart_use): /* nibble_ashr_10_use jumps back here when the loop may continue */ |
1208 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1209 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 10..25 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1210 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1211 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1212 | #else |
1213 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1214 | TOLOWER (%xmm0, %xmm1) |
1215 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1216 | #endif |
1217 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1218 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1219 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1220 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1221 | #endif |
1222 | |
1223 | add $16, %rdx /* advance to the next block */ |
1224 | add $16, %r10 |
1225 | jg LABEL(nibble_ashr_10_use) |
1226 | |
1227 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1228 | palignr $10, -16(%rdi, %rdx), D(%xmm0) |
1229 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1230 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1231 | #else |
1232 | movdqa (%rsi,%rdx), %xmm1 |
1233 | TOLOWER (%xmm0, %xmm1) |
1234 | pcmpistri $0x1a, %xmm1, %xmm0 |
1235 | #endif |
1236 | jbe LABEL(exit_use) |
1237 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1238 | sub $16, %r11 |
1239 | jbe LABEL(strcmp_exitz) |
1240 | #endif |
1241 | add $16, %rdx /* advance to the next block */ |
1242 | jmp LABEL(loop_ashr_10_use) |
1243 | /* nibble_ashr_10_use: page-boundary check for ashr_10; looks for the end of the string in the 6 bytes of the previous %rdi block that the next compare step would use. */ |
1244 | .p2align 4 |
1245 | LABEL(nibble_ashr_10_use): |
1246 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1247 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1248 | psrldq $10, D(%xmm0) /* move its upper 6 bytes to the low lanes; the top 10 bytes become zero */ |
1249 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1250 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1251 | cmp %r11, %rcx |
1252 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1253 | #endif |
1254 | cmp $5, %ecx |
1255 | ja LABEL(nibble_ashr_10_restart_use) /* index > 5: none of the 6 data bytes is null, resume the loop */ |
1256 | |
1257 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1258 | |
1259 | /* |
1260 | * The following cases will be handled by ashr_11: |
1261 | * rcx (offset of rsi) = n, with n in 5~15; rax (offset of rdi) = n - 5; |
1262 | * relative offset = 10, i.e. 15 + (n - 5) - n; corresponding case = ashr_11. |
1263 | */ |
1264 | .p2align 4 /* align the entry label to 16 bytes */ |
1265 | LABEL(ashr_11): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1266 | pslldq $5, D(%xmm2) /* shift %xmm2 left by 5 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1267 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1268 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1269 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1270 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1271 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1272 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1273 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1274 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1275 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1276 | |
1277 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1278 | |
1279 | mov $16, %rcx /* index for loads */ |
1280 | mov $11, %r9d /* byte position left over from less32bytes case */ |
1281 | /* |
1282 | * Set up %r10 so that crossing a page boundary can be detected. |
1283 | * When %r10 goes positive we have crossed a page boundary and |
1284 | * need to do a nibble. |
1285 | */ |
1286 | lea 11(%rdi), %r10 /* %r10 = %rdi + 11 */ |
1287 | and $0xfff, %r10 /* offset into 4K page */ |
1288 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1289 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1290 | /* loop_ashr_11_use: compare loop for ashr_11, unrolled twice; each step realigns 16 bytes of the %rdi data by 11 and compares them with the block at (%rsi,%rdx). */ |
1291 | .p2align 4 |
1292 | LABEL(loop_ashr_11_use): |
1293 | add $16, %r10 /* account for the next 16-byte block */ |
1294 | jg LABEL(nibble_ashr_11_use) /* %r10 positive: page boundary, do the nibble check first */ |
1295 | |
1296 | LABEL(nibble_ashr_11_restart_use): /* nibble_ashr_11_use jumps back here when the loop may continue */ |
1297 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1298 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 11..26 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1299 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1300 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1301 | #else |
1302 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1303 | TOLOWER (%xmm0, %xmm1) |
1304 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1305 | #endif |
1306 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1307 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1308 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1309 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1310 | #endif |
1311 | |
1312 | add $16, %rdx /* advance to the next block */ |
1313 | add $16, %r10 |
1314 | jg LABEL(nibble_ashr_11_use) |
1315 | |
1316 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1317 | palignr $11, -16(%rdi, %rdx), D(%xmm0) |
1318 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1319 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1320 | #else |
1321 | movdqa (%rsi,%rdx), %xmm1 |
1322 | TOLOWER (%xmm0, %xmm1) |
1323 | pcmpistri $0x1a, %xmm1, %xmm0 |
1324 | #endif |
1325 | jbe LABEL(exit_use) |
1326 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1327 | sub $16, %r11 |
1328 | jbe LABEL(strcmp_exitz) |
1329 | #endif |
1330 | add $16, %rdx /* advance to the next block */ |
1331 | jmp LABEL(loop_ashr_11_use) |
1332 | /* nibble_ashr_11_use: page-boundary check for ashr_11; looks for the end of the string in the 5 bytes of the previous %rdi block that the next compare step would use. */ |
1333 | .p2align 4 |
1334 | LABEL(nibble_ashr_11_use): |
1335 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1336 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1337 | psrldq $11, D(%xmm0) /* move its upper 5 bytes to the low lanes; the top 11 bytes become zero */ |
1338 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1339 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1340 | cmp %r11, %rcx |
1341 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1342 | #endif |
1343 | cmp $4, %ecx |
1344 | ja LABEL(nibble_ashr_11_restart_use) /* index > 4: none of the 5 data bytes is null, resume the loop */ |
1345 | |
1346 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1347 | |
1348 | /* |
1349 | * The following cases will be handled by ashr_12: |
1350 | * rcx (offset of rsi) = n, with n in 4~15; rax (offset of rdi) = n - 4; |
1351 | * relative offset = 11, i.e. 15 + (n - 4) - n; corresponding case = ashr_12. |
1352 | */ |
1353 | .p2align 4 /* align the entry label to 16 bytes */ |
1354 | LABEL(ashr_12): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1355 | pslldq $4, D(%xmm2) /* shift %xmm2 left by 4 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1356 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1357 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1358 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1359 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1360 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1361 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1362 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1363 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1364 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1365 | |
1366 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1367 | |
1368 | mov $16, %rcx /* index for loads */ |
1369 | mov $12, %r9d /* byte position left over from less32bytes case */ |
1370 | /* |
1371 | * Set up %r10 so that crossing a page boundary can be detected. |
1372 | * When %r10 goes positive we have crossed a page boundary and |
1373 | * need to do a nibble. |
1374 | */ |
1375 | lea 12(%rdi), %r10 /* %r10 = %rdi + 12 */ |
1376 | and $0xfff, %r10 /* offset into 4K page */ |
1377 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1378 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1379 | /* loop_ashr_12_use: compare loop for ashr_12, unrolled twice; each step realigns 16 bytes of the %rdi data by 12 and compares them with the block at (%rsi,%rdx). */ |
1380 | .p2align 4 |
1381 | LABEL(loop_ashr_12_use): |
1382 | add $16, %r10 /* account for the next 16-byte block */ |
1383 | jg LABEL(nibble_ashr_12_use) /* %r10 positive: page boundary, do the nibble check first */ |
1384 | |
1385 | LABEL(nibble_ashr_12_restart_use): /* nibble_ashr_12_use jumps back here when the loop may continue */ |
1386 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1387 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 12..27 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1388 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1389 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1390 | #else |
1391 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1392 | TOLOWER (%xmm0, %xmm1) |
1393 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1394 | #endif |
1395 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1396 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1397 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1398 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1399 | #endif |
1400 | |
1401 | add $16, %rdx /* advance to the next block */ |
1402 | add $16, %r10 |
1403 | jg LABEL(nibble_ashr_12_use) |
1404 | |
1405 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1406 | palignr $12, -16(%rdi, %rdx), D(%xmm0) |
1407 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1408 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1409 | #else |
1410 | movdqa (%rsi,%rdx), %xmm1 |
1411 | TOLOWER (%xmm0, %xmm1) |
1412 | pcmpistri $0x1a, %xmm1, %xmm0 |
1413 | #endif |
1414 | jbe LABEL(exit_use) |
1415 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1416 | sub $16, %r11 |
1417 | jbe LABEL(strcmp_exitz) |
1418 | #endif |
1419 | add $16, %rdx /* advance to the next block */ |
1420 | jmp LABEL(loop_ashr_12_use) |
1421 | /* nibble_ashr_12_use: page-boundary check for ashr_12; looks for the end of the string in the 4 bytes of the previous %rdi block that the next compare step would use. */ |
1422 | .p2align 4 |
1423 | LABEL(nibble_ashr_12_use): |
1424 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1425 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1426 | psrldq $12, D(%xmm0) /* move its upper 4 bytes to the low lanes; the top 12 bytes become zero */ |
1427 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1428 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1429 | cmp %r11, %rcx |
1430 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1431 | #endif |
1432 | cmp $3, %ecx |
1433 | ja LABEL(nibble_ashr_12_restart_use) /* index > 3: none of the 4 data bytes is null, resume the loop */ |
1434 | |
1435 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1436 | |
1437 | /* |
1438 | * The following cases will be handled by ashr_13: |
1439 | * rcx (offset of rsi) = n, with n in 3~15; rax (offset of rdi) = n - 3; |
1440 | * relative offset = 12, i.e. 15 + (n - 3) - n; corresponding case = ashr_13. |
1441 | */ |
1442 | .p2align 4 /* align the entry label to 16 bytes */ |
1443 | LABEL(ashr_13): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1444 | pslldq $3, D(%xmm2) /* shift %xmm2 left by 3 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1445 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1446 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1447 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1448 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1449 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1450 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1451 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1452 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1453 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1454 | |
1455 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1456 | |
1457 | mov $16, %rcx /* index for loads */ |
1458 | mov $13, %r9d /* byte position left over from less32bytes case */ |
1459 | /* |
1460 | * Set up %r10 so that crossing a page boundary can be detected. |
1461 | * When %r10 goes positive we have crossed a page boundary and |
1462 | * need to do a nibble. |
1463 | */ |
1464 | lea 13(%rdi), %r10 /* %r10 = %rdi + 13 */ |
1465 | and $0xfff, %r10 /* offset into 4K page */ |
1466 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1467 | |
1468 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1469 | /* loop_ashr_13_use: compare loop for ashr_13, unrolled twice; each step realigns 16 bytes of the %rdi data by 13 and compares them with the block at (%rsi,%rdx). */ |
1470 | .p2align 4 |
1471 | LABEL(loop_ashr_13_use): |
1472 | add $16, %r10 /* account for the next 16-byte block */ |
1473 | jg LABEL(nibble_ashr_13_use) /* %r10 positive: page boundary, do the nibble check first */ |
1474 | |
1475 | LABEL(nibble_ashr_13_restart_use): /* nibble_ashr_13_use jumps back here when the loop may continue */ |
1476 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1477 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 13..28 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1478 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1479 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1480 | #else |
1481 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1482 | TOLOWER (%xmm0, %xmm1) |
1483 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1484 | #endif |
1485 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1486 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1487 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1488 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1489 | #endif |
1490 | |
1491 | add $16, %rdx /* advance to the next block */ |
1492 | add $16, %r10 |
1493 | jg LABEL(nibble_ashr_13_use) |
1494 | |
1495 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1496 | palignr $13, -16(%rdi, %rdx), D(%xmm0) |
1497 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1498 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1499 | #else |
1500 | movdqa (%rsi,%rdx), %xmm1 |
1501 | TOLOWER (%xmm0, %xmm1) |
1502 | pcmpistri $0x1a, %xmm1, %xmm0 |
1503 | #endif |
1504 | jbe LABEL(exit_use) |
1505 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1506 | sub $16, %r11 |
1507 | jbe LABEL(strcmp_exitz) |
1508 | #endif |
1509 | add $16, %rdx /* advance to the next block */ |
1510 | jmp LABEL(loop_ashr_13_use) |
1511 | /* nibble_ashr_13_use: page-boundary check for ashr_13; looks for the end of the string in the 3 bytes of the previous %rdi block that the next compare step would use. */ |
1512 | .p2align 4 |
1513 | LABEL(nibble_ashr_13_use): |
1514 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1515 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1516 | psrldq $13, D(%xmm0) /* move its upper 3 bytes to the low lanes; the top 13 bytes become zero */ |
1517 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1518 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1519 | cmp %r11, %rcx |
1520 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1521 | #endif |
1522 | cmp $2, %ecx |
1523 | ja LABEL(nibble_ashr_13_restart_use) /* index > 2: none of the 3 data bytes is null, resume the loop */ |
1524 | |
1525 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1526 | |
1527 | /* |
1528 | * The following cases will be handled by ashr_14: |
1529 | * rcx (offset of rsi) = n, with n in 2~15; rax (offset of rdi) = n - 2; |
1530 | * relative offset = 13, i.e. 15 + (n - 2) - n; corresponding case = ashr_14. |
1531 | */ |
1532 | .p2align 4 /* align the entry label to 16 bytes */ |
1533 | LABEL(ashr_14): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1534 | pslldq $2, D(%xmm2) /* shift %xmm2 left by 2 bytes, zero-filling the low bytes; D() is a macro defined earlier in the file */ |
1535 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1536 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1537 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1538 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1539 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1540 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1541 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1542 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1543 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1544 | |
1545 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1546 | |
1547 | mov $16, %rcx /* index for loads */ |
1548 | mov $14, %r9d /* byte position left over from less32bytes case */ |
1549 | /* |
1550 | * Set up %r10 so that crossing a page boundary can be detected. |
1551 | * When %r10 goes positive we have crossed a page boundary and |
1552 | * need to do a nibble. |
1553 | */ |
1554 | lea 14(%rdi), %r10 /* %r10 = %rdi + 14 */ |
1555 | and $0xfff, %r10 /* offset into 4K page */ |
1556 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1557 | |
1558 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1559 | /* loop_ashr_14_use: compare loop for ashr_14, unrolled twice; each step realigns 16 bytes of the %rdi data by 14 and compares them with the block at (%rsi,%rdx). */ |
1560 | .p2align 4 |
1561 | LABEL(loop_ashr_14_use): |
1562 | add $16, %r10 /* account for the next 16-byte block */ |
1563 | jg LABEL(nibble_ashr_14_use) /* %r10 positive: page boundary, do the nibble check first */ |
1564 | |
1565 | LABEL(nibble_ashr_14_restart_use): /* nibble_ashr_14_use jumps back here when the loop may continue */ |
1566 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1567 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 14..29 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1568 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1569 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1570 | #else |
1571 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1572 | TOLOWER (%xmm0, %xmm1) |
1573 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1574 | #endif |
1575 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1576 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1577 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1578 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1579 | #endif |
1580 | |
1581 | add $16, %rdx /* advance to the next block */ |
1582 | add $16, %r10 |
1583 | jg LABEL(nibble_ashr_14_use) |
1584 | |
1585 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1586 | palignr $14, -16(%rdi, %rdx), D(%xmm0) |
1587 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1588 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1589 | #else |
1590 | movdqa (%rsi,%rdx), %xmm1 |
1591 | TOLOWER (%xmm0, %xmm1) |
1592 | pcmpistri $0x1a, %xmm1, %xmm0 |
1593 | #endif |
1594 | jbe LABEL(exit_use) |
1595 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1596 | sub $16, %r11 |
1597 | jbe LABEL(strcmp_exitz) |
1598 | #endif |
1599 | add $16, %rdx /* advance to the next block */ |
1600 | jmp LABEL(loop_ashr_14_use) |
1601 | /* nibble_ashr_14_use: page-boundary check for ashr_14; looks for the end of the string in the 2 bytes of the previous %rdi block that the next compare step would use. */ |
1602 | .p2align 4 |
1603 | LABEL(nibble_ashr_14_use): |
1604 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1605 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1606 | psrldq $14, D(%xmm0) /* move its upper 2 bytes to the low lanes; the top 14 bytes become zero */ |
1607 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1608 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1609 | cmp %r11, %rcx |
1610 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1611 | #endif |
1612 | cmp $1, %ecx |
1613 | ja LABEL(nibble_ashr_14_restart_use) /* index > 1: neither of the 2 data bytes is null, resume the loop */ |
1614 | |
1615 | jmp LABEL(nibble_ashr_exit_use) /* a null byte lies within those bytes: do the final compare in the exit tail */ |
1616 | |
1617 | /* |
1618 | * The following cases will be handled by ashr_15: |
1619 | * rcx (offset of rsi) = n, with n in 1~15; rax (offset of rdi) = n - 1; |
1620 | * relative offset = 14, i.e. 15 + (n - 1) - n; corresponding case = ashr_15. |
1621 | */ |
1622 | .p2align 4 /* align the entry label to 16 bytes */ |
1623 | LABEL(ashr_15): /* Jump-table target listed in unaligned_table: checks the data already in %xmm0-%xmm2 (loaded before this label), then sets up the loop registers. */ |
1624 | pslldq $1, D(%xmm2) /* shift %xmm2 left by 1 byte, zero-filling the low byte; D() is a macro defined earlier in the file */ |
1625 | TOLOWER (%xmm1, %xmm2) /* macro defined earlier in the file; presumably lower-cases both operands in the case-insensitive variants - confirm */ |
1626 | pcmpeqb %xmm1, D(%xmm2) /* bytes of %xmm2 become 0xff where they equal %xmm1, else 0 */ |
1627 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 bytewise; presumably %xmm0 holds a null-byte mask computed earlier - confirm */ |
1628 | pmovmskb %xmm2, %r9d /* %r9d = bit mask built from the top bit of each byte */ |
1629 | shr %cl, %edx /* discard the low %cl bits of %edx (value set before this label) */ |
1630 | shr %cl, %r9d /* discard the same number of low bits of the computed mask */ |
1631 | sub %r9d, %edx /* %edx becomes nonzero when the two masks differ */ |
1632 | jnz LABEL(less32bytes) /* nonzero: finish in less32bytes, which scans %rdx for its lowest set bit */ |
1633 | |
1634 | movdqa (%rdi), %xmm3 /* aligned 16-byte load from (%rdi) */ |
1635 | |
1636 | UPDATE_STRNCMP_COUNTER /* length-limited variants: %r11 = %r11 + %rcx - 16, returning 0 if the new count is 0 or larger than the old one; expands to nothing otherwise */ |
1637 | |
1638 | mov $16, %rcx /* index for loads */ |
1639 | mov $15, %r9d /* byte position left over from less32bytes case */ |
1640 | /* |
1641 | * Set up %r10 so that crossing a page boundary can be detected. |
1642 | * When %r10 goes positive we have crossed a page boundary and |
1643 | * need to do a nibble. |
1644 | */ |
1645 | lea 15(%rdi), %r10 /* %r10 = %rdi + 15 */ |
1646 | and $0xfff, %r10 /* offset into 4K page */ |
1647 | |
1648 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1649 | |
1650 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
1651 | /* loop_ashr_15_use: compare loop for ashr_15, unrolled twice; each step realigns 16 bytes of the %rdi data by 15 and compares them with the block at (%rsi,%rdx). */ |
1652 | .p2align 4 |
1653 | LABEL(loop_ashr_15_use): |
1654 | add $16, %r10 /* account for the next 16-byte block */ |
1655 | jg LABEL(nibble_ashr_15_use) /* %r10 positive: page boundary, do the nibble check first */ |
1656 | |
1657 | LABEL(nibble_ashr_15_restart_use): /* nibble_ashr_15_use jumps back here when the loop may continue */ |
1658 | movdqa (%rdi, %rdx), %xmm0 /* aligned load of the %rdi block at offset %rdx */ |
1659 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = bytes 15..30 of the 32 bytes starting at -16(%rdi,%rdx) */ |
1660 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1661 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx = index of first differing byte; CF = difference found; ZF = null byte in the (%rsi,%rdx) block */ |
1662 | #else |
1663 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1664 | TOLOWER (%xmm0, %xmm1) |
1665 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1666 | #endif |
1667 | jbe LABEL(exit_use) /* CF or ZF set: difference or end of string, leave the loop */ |
1668 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1669 | sub $16, %r11 /* length-limited variants: 16 more bytes consumed */ |
1670 | jbe LABEL(strcmp_exitz) /* at most 16 bytes were left: limit reached, return 0 */ |
1671 | #endif |
1672 | |
1673 | add $16, %rdx /* advance to the next block */ |
1674 | add $16, %r10 |
1675 | jg LABEL(nibble_ashr_15_use) |
1676 | |
1677 | movdqa (%rdi, %rdx), %xmm0 /* second copy of the compare step above */ |
1678 | palignr $15, -16(%rdi, %rdx), D(%xmm0) |
1679 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1680 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1681 | #else |
1682 | movdqa (%rsi,%rdx), %xmm1 |
1683 | TOLOWER (%xmm0, %xmm1) |
1684 | pcmpistri $0x1a, %xmm1, %xmm0 |
1685 | #endif |
1686 | jbe LABEL(exit_use) |
1687 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1688 | sub $16, %r11 |
1689 | jbe LABEL(strcmp_exitz) |
1690 | #endif |
1691 | add $16, %rdx /* advance to the next block */ |
1692 | jmp LABEL(loop_ashr_15_use) |
1693 | /* nibble_ashr_15_use: page-boundary check for ashr_15; tests whether the last byte of the previous %rdi block, which the next compare step would use, ends the string. */ |
1694 | .p2align 4 |
1695 | LABEL(nibble_ashr_15_use): |
1696 | sub $0x1000, %r10 /* rewind the page tracker by 4K */ |
1697 | movdqa -16(%rdi, %rdx), %xmm0 /* aligned load of the previous %rdi block */ |
1698 | psrldq $15, D(%xmm0) /* move its last byte to the lowest lane; the top 15 bytes become zero */ |
1699 | pcmpistri $0x3a,%xmm0, %xmm0 /* %xmm0 against itself: %ecx = index of the first zero byte in %xmm0 */ |
1700 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1701 | cmp %r11, %rcx |
1702 | jae LABEL(nibble_ashr_exit_use) /* length-limited variants: that index is not below the remaining count %r11, take the exit tail */ |
1703 | #endif |
1704 | cmp $0, %ecx |
1705 | ja LABEL(nibble_ashr_15_restart_use) /* index > 0: the remaining data byte is not null, resume the loop; otherwise fall through to nibble_ashr_exit_use */ |
1706 | /* Exit tail used by the ashr_N loops: turns the result of the last pcmpistri into the return value in %eax. */ |
1707 | LABEL(nibble_ashr_exit_use): /* entered from the nibble checks with the shifted %rdi data in %xmm0 */ |
1708 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1709 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* final compare against the (%rsi,%rdx) block; sets CF and %ecx for exit_use */ |
1710 | #else |
1711 | movdqa (%rsi,%rdx), %xmm1 /* case-insensitive variants: load the %rsi block into a register for TOLOWER */ |
1712 | TOLOWER (%xmm0, %xmm1) |
1713 | pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison after TOLOWER */ |
1714 | #endif |
1715 | .p2align 4 |
1716 | LABEL(exit_use): /* entered with CF and %ecx from the preceding pcmpistri */ |
1717 | jnc LABEL(strcmp_exitz) /* CF clear: no differing byte was found, return 0 */ |
1718 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1719 | sub %rcx, %r11 /* length-limited variants: remaining count minus the index of the difference */ |
1720 | jbe LABEL(strcmp_exitz) /* count <= index: the difference lies outside the limit, return 0 */ |
1721 | #endif |
1722 | add %rcx, %rdx /* %rdx = block offset + index of the differing byte */ |
1723 | lea -16(%rdi, %r9), %rdi /* %rdi = %rdi + %r9 - 16, where %r9 is the byte position set by the ashr_N setup code */ |
1724 | movzbl (%rdi, %rdx), %eax /* differing byte on the %rdi side, zero-extended */ |
1725 | movzbl (%rsi, %rdx), %edx /* differing byte on the %rsi side, zero-extended */ |
1726 | test %r8d, %r8d |
1727 | jz LABEL(ret_use) |
1728 | xchg %eax, %edx /* %r8d != 0: swap the two bytes; %r8d is set before this chunk, presumably when the operands were exchanged - confirm */ |
1729 | LABEL(ret_use): |
1730 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1731 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* case-insensitive variants: table base, 128 four-byte entries past the symbol */ |
1732 | movl (%rcx,%rdx,4), %edx /* replace each byte by its 32-bit table entry */ |
1733 | movl (%rcx,%rax,4), %eax |
1734 | #endif |
1735 | |
1736 | sub %edx, %eax /* return value: difference of the two values */ |
1737 | ret |
1738 | /* Short tail: %rdx holds a bit mask whose lowest set bit selects the byte pair to compare; returns the difference of those bytes in %eax. */ |
1739 | LABEL(less32bytes): |
1740 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ |
1741 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ |
1742 | test %r8d, %r8d |
1743 | jz LABEL(ret) /* flag clear: operands are already in their original order */ |
1744 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ |
1745 | |
1746 | .p2align 4 |
1747 | LABEL(ret): |
1748 | LABEL(less16bytes): |
1749 | bsf %rdx, %rdx /* find and store bit index in %rdx */ |
1750 | |
1751 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1752 | sub %rdx, %r11 /* length-limited variants: remaining count minus the byte index */ |
1753 | jbe LABEL(strcmp_exitz) /* count <= index: the byte lies outside the limit, return 0 */ |
1754 | #endif |
1755 | movzbl (%rsi, %rdx), %ecx /* byte from the %rsi string, zero-extended */ |
1756 | movzbl (%rdi, %rdx), %eax /* byte from the %rdi string, zero-extended */ |
1757 | |
1758 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1759 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* case-insensitive variants: table base, 128 four-byte entries past the symbol */ |
1760 | movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */ |
1761 | movl (%rdx,%rax,4), %eax |
1762 | #endif |
1763 | |
1764 | sub %ecx, %eax /* return value: %rdi-side value minus %rsi-side value */ |
1765 | ret |
1766 | /* strcmp_exitz: common exit that returns 0; the loops and tails above branch here when no difference counts. */ |
1767 | LABEL(strcmp_exitz): |
1768 | xor %eax, %eax /* %eax = 0 */ |
1769 | ret |
1770 | /* Byte0: return the difference of the bytes at (%rdi) and (%rsi) in %eax; same tail as less16bytes but without an index. */ |
1771 | .p2align 4 |
1772 | // XXX Same as code above |
1773 | LABEL(Byte0): |
1774 | movzbl (%rsi), %ecx /* byte at (%rsi), zero-extended; explicit b/l suffixes instead of suffix-less movzx, matching the movzbl loads in the other exit tails */ |
1775 | movzbl (%rdi), %eax /* byte at (%rdi), zero-extended */ |
1776 | |
1777 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1778 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* case-insensitive variants: table base, 128 four-byte entries past the symbol */ |
1779 | movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */ |
1780 | movl (%rdx,%rax,4), %eax |
1781 | #endif |
1782 | |
1783 | sub %ecx, %eax /* return value: %rdi-side value minus %rsi-side value */ |
1784 | ret |
1785 | cfi_endproc /* end of the call-frame-information region; the matching start is earlier in the file (macro presumably from sysdep.h) */ |
1786 | .size STRCMP_SSE42, .-STRCMP_SSE42 /* symbol size = distance from the STRCMP_SSE42 label to this point */ |
1787 | /* Remove the UCLOW_reg, UCHIGH_reg, LCQWORD_reg and TOLOWER macros defined earlier in the file, presumably so the file can be included again with different settings - confirm. */ |
1788 | #undef UCLOW_reg |
1789 | #undef UCHIGH_reg |
1790 | #undef LCQWORD_reg |
1791 | #undef TOLOWER |
1792 | |
1793 | /* Put all SSE 4.2 functions together. This read-only section (SECTION is a macro set to sse4.2 or avx at the top of the file) holds the table of ashr_N entry points: each entry is the 32-bit offset of a label from the start of the table, entries 0..14 are ashr_1..ashr_15 and entry 15 is ashr_0. The code that indexes the table is earlier in the file. */ |
1794 | .section .rodata.SECTION,"a" ,@progbits |
1795 | .p2align 3 /* align the table to 8 bytes */ |
1796 | LABEL(unaligned_table): |
1797 | .int LABEL(ashr_1) - LABEL(unaligned_table) |
1798 | .int LABEL(ashr_2) - LABEL(unaligned_table) |
1799 | .int LABEL(ashr_3) - LABEL(unaligned_table) |
1800 | .int LABEL(ashr_4) - LABEL(unaligned_table) |
1801 | .int LABEL(ashr_5) - LABEL(unaligned_table) |
1802 | .int LABEL(ashr_6) - LABEL(unaligned_table) |
1803 | .int LABEL(ashr_7) - LABEL(unaligned_table) /* index 6, the "relative offset" given in the ashr_7 header comment */ |
1804 | .int LABEL(ashr_8) - LABEL(unaligned_table) |
1805 | .int LABEL(ashr_9) - LABEL(unaligned_table) |
1806 | .int LABEL(ashr_10) - LABEL(unaligned_table) |
1807 | .int LABEL(ashr_11) - LABEL(unaligned_table) |
1808 | .int LABEL(ashr_12) - LABEL(unaligned_table) |
1809 | .int LABEL(ashr_13) - LABEL(unaligned_table) |
1810 | .int LABEL(ashr_14) - LABEL(unaligned_table) |
1811 | .int LABEL(ashr_15) - LABEL(unaligned_table) |
1812 | .int LABEL(ashr_0) - LABEL(unaligned_table) /* last entry: the ashr_0 case */ |
1813 | /* Remove the remaining file-local macros. The instruction names below appear as macro names, so they may have been redefined earlier in the file (presumably for the USE_AVX build) - confirm. */ |
1814 | #undef LABEL |
1815 | #undef GLABEL |
1816 | #undef SECTION |
1817 | #undef movdqa |
1818 | #undef movdqu |
1819 | #undef pmovmskb |
1820 | #undef pcmpistri |
1821 | #undef psubb |
1822 | #undef pcmpeqb |
1823 | #undef psrldq |
1824 | #undef pslldq |
1825 | #undef palignr |
1826 | #undef pxor |
1827 | #undef D |
1828 | |