1 | /* strcmp with SSE4.2 |
2 | Copyright (C) 2009-2017 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ |
19 | |
20 | |
21 | /* We use 0x1a: |
22 | _SIDD_SBYTE_OPS |
23 | | _SIDD_CMP_EQUAL_EACH |
24 | | _SIDD_NEGATIVE_POLARITY |
25 | | _SIDD_LEAST_SIGNIFICANT |
26 | on pcmpistri to find out if two 16byte data elements are the same |
27 | and the offset of the first different byte. There are 4 cases: |
28 | |
29 | 1. Both 16byte data elements are valid and identical. |
30 | 2. Both 16byte data elements have EOS and identical. |
31 | 3. Both 16byte data elements are valid and they differ at offset X. |
32 | 4. At least one 16byte data element has EOS at offset X. Two 16byte |
33 | data elements must differ at or before offset X. |
34 | |
35 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: |
36 | |
37 | case ECX CFlag ZFlag SFlag |
38 | 1 16 0 0 0 |
39 | 2 16 0 1 1 |
40 | 3 X 1 0 0 |
41 | 4 0 <= X 1 0/1 0/1 |
42 | |
43 | We exit from the loop for cases 2, 3 and 4 with jbe which branches |
44 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for |
45 | case 2. */ |
46 | |
47 | /* Put all SSE 4.2 functions together. */ |
48 | .section .text.SECTION,"ax" ,@progbits |
49 | .align 16 |
50 | .type STRCMP_SSE42, @function |
51 | .globl STRCMP_SSE42 |
52 | .hidden STRCMP_SSE42 |
53 | #ifdef USE_AS_STRCASECMP_L |
54 | ENTRY (GLABEL(__strcasecmp)) |
55 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of the TLS locale slot */ |
56 | mov %fs:(%rax),%RDX_LP /* load the current thread's locale pointer into the third-argument register */ |
57 | /* STRCMP_SSE42 below reads the locale from %rdx in the USE_AS_STRCASECMP_L build. */ |
58 | // XXX These 5 bytes should be before the function |
59 | /* 5-byte NOP. */ |
60 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
61 | END (GLABEL(__strcasecmp)) |
62 | /* FALLTHROUGH to strcasecmp_l. */ |
63 | #endif |
64 | #ifdef USE_AS_STRNCASECMP_L |
65 | ENTRY (GLABEL(__strncasecmp)) |
66 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of the TLS locale slot */ |
67 | mov %fs:(%rax),%RCX_LP /* load the current thread's locale pointer into the fourth-argument register */ |
68 | /* STRCMP_SSE42 below reads the locale from %rcx in the USE_AS_STRNCASECMP_L build. */ |
69 | // XXX These 5 bytes should be before the function |
70 | /* 5-byte NOP. */ |
71 | .byte 0x0f,0x1f,0x44,0x00,0x00 |
72 | END (GLABEL(__strncasecmp)) |
73 | /* FALLTHROUGH to strncasecmp_l. */ |
74 | #endif |
75 | |
76 | /* With USE_AVX the SSE mnemonics used below are remapped to their v-prefixed AVX forms; D(arg) repeats the destination so that the two-operand SSE syntax expands to the three-operand AVX syntax. */ |
77 | #ifdef USE_AVX |
78 | # define movdqa vmovdqa |
79 | # define movdqu vmovdqu |
80 | # define pmovmskb vpmovmskb |
81 | # define pcmpistri vpcmpistri |
82 | # define psubb vpsubb |
83 | # define pcmpeqb vpcmpeqb |
84 | # define psrldq vpsrldq |
85 | # define pslldq vpslldq |
86 | # define palignr vpalignr |
87 | # define pxor vpxor |
88 | # define D(arg) arg, arg |
89 | #else |
90 | # define D(arg) arg |
91 | #endif |
92 | |
93 | STRCMP_SSE42: |
94 | cfi_startproc |
95 | CALL_MCOUNT |
96 | /* On entry %rdi and %rsi point to the two strings to compare. */ |
97 | /* |
98 | * This implementation uses SSE to compare up to 16 bytes at a time. |
99 | */ |
100 | #ifdef USE_AS_STRCASECMP_L |
101 | /* We have to fall back on the C implementation for locales |
102 | with encodings not matching ASCII for single bytes. */ |
103 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
104 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP |
105 | # else |
106 | mov (%rdx), %RAX_LP |
107 | # endif |
108 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
109 | jne __strcasecmp_l_nonascii |
110 | #endif |
111 | #ifdef USE_AS_STRNCASECMP_L |
112 | /* We have to fall back on the C implementation for locales |
113 | with encodings not matching ASCII for single bytes. */ |
114 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
115 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP |
116 | # else |
117 | mov (%rcx), %RAX_LP |
118 | # endif |
119 | testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) |
120 | jne __strncasecmp_l_nonascii |
121 | #endif |
122 | /* Length-limited variants: %rdx holds the maximum number of bytes to compare. */ |
123 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
124 | test %rdx, %rdx /* limit of 0: nothing to compare */ |
125 | je LABEL(strcmp_exitz) |
126 | cmp $1, %rdx /* limit of 1 is handled separately at Byte0 */ |
127 | je LABEL(Byte0) |
128 | mov %rdx, %r11 /* %r11 = bytes left to compare; decremented as blocks are consumed */ |
129 | #endif |
130 | mov %esi, %ecx /* low address bits of %rsi, masked below */ |
131 | mov %edi, %eax /* low address bits of %rdi, masked below */ |
132 | /* Use 64bit AND here to avoid long NOP padding. */ |
133 | and $0x3f, %rcx /* rsi alignment in cache line */ |
134 | and $0x3f, %rax /* rdi alignment in cache line */ |
135 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
136 | .section .rodata.cst16,"aM" ,@progbits,16 |
137 | .align 16 |
138 | LABEL(belowupper): /* 0x40 ('A' - 1) in every byte: lower bound for the TOLOWER range test */ |
139 | .quad 0x4040404040404040 |
140 | .quad 0x4040404040404040 |
141 | LABEL(topupper): /* upper bound: 0x5a ('Z') for the AVX TOLOWER, 0x5b ('Z' + 1) for the SSE one */ |
142 | # ifdef USE_AVX |
143 | .quad 0x5a5a5a5a5a5a5a5a |
144 | .quad 0x5a5a5a5a5a5a5a5a |
145 | # else |
146 | .quad 0x5b5b5b5b5b5b5b5b |
147 | .quad 0x5b5b5b5b5b5b5b5b |
148 | # endif |
149 | LABEL(touppermask): /* 0x20 in every byte: the bit TOLOWER ORs into upper-case letters */ |
150 | .quad 0x2020202020202020 |
151 | .quad 0x2020202020202020 |
152 | .previous |
153 | movdqa LABEL(belowupper)(%rip), %xmm4 |
154 | # define UCLOW_reg %xmm4 |
155 | movdqa LABEL(topupper)(%rip), %xmm5 |
156 | # define UCHIGH_reg %xmm5 |
157 | movdqa LABEL(touppermask)(%rip), %xmm6 |
158 | # define LCQWORD_reg %xmm6 |
159 | #endif |
160 | cmp $0x30, %ecx |
161 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ |
162 | cmp $0x30, %eax |
163 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ |
164 | movdqu (%rdi), %xmm1 |
165 | movdqu (%rsi), %xmm2 |
166 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
167 | # ifdef USE_AVX |
168 | # define TOLOWER(reg1, reg2) \ |
169 | vpcmpgtb UCLOW_reg, reg1, %xmm7; \ |
170 | vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ |
171 | vpcmpgtb UCLOW_reg, reg2, %xmm9; \ |
172 | vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ |
173 | vpandn %xmm7, %xmm8, %xmm8; \ |
174 | vpandn %xmm9, %xmm10, %xmm10; \ |
175 | vpand LCQWORD_reg, %xmm8, %xmm8; \ |
176 | vpand LCQWORD_reg, %xmm10, %xmm10; \ |
177 | vpor reg1, %xmm8, reg1; \ |
178 | vpor reg2, %xmm10, reg2 |
179 | # else |
180 | # define TOLOWER(reg1, reg2) \ |
181 | movdqa reg1, %xmm7; \ |
182 | movdqa UCHIGH_reg, %xmm8; \ |
183 | movdqa reg2, %xmm9; \ |
184 | movdqa UCHIGH_reg, %xmm10; \ |
185 | pcmpgtb UCLOW_reg, %xmm7; \ |
186 | pcmpgtb reg1, %xmm8; \ |
187 | pcmpgtb UCLOW_reg, %xmm9; \ |
188 | pcmpgtb reg2, %xmm10; \ |
189 | pand %xmm8, %xmm7; \ |
190 | pand %xmm10, %xmm9; \ |
191 | pand LCQWORD_reg, %xmm7; \ |
192 | pand LCQWORD_reg, %xmm9; \ |
193 | por %xmm7, reg1; \ |
194 | por %xmm9, reg2 |
195 | # endif |
196 | TOLOWER (%xmm1, %xmm2) /* OR 0x20 into every byte in 'A'..'Z' of both registers; uses %xmm7-%xmm10 as scratch */ |
197 | #else |
198 | # define TOLOWER(reg1, reg2) /* case-sensitive builds: expands to nothing */ |
199 | #endif |
200 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ |
201 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
202 | pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ |
203 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results */ |
204 | pmovmskb %xmm1, %edx |
205 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ |
206 | jnz LABEL(less16bytes)/* If not, find different value or null char */ |
207 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
208 | sub $16, %r11 |
209 | jbe LABEL(strcmp_exitz)/* finish comparison */ |
210 | #endif |
211 | add $16, %rsi /* prepare to search next 16 bytes */ |
212 | add $16, %rdi /* prepare to search next 16 bytes */ |
213 | |
214 | /* |
215 | * Determine source and destination string offsets from 16-byte |
216 | * alignment. Use relative offset difference between the two to |
217 | * determine which case below to use. |
218 | */ |
219 | .p2align 4 |
220 | LABEL(crosscache): |
221 | and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */ |
222 | and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */ |
223 | mov $0xffff, %edx /* for equivalent offset */ |
224 | xor %r8d, %r8d /* %r8d = 0: pointers not swapped (see below) */ |
225 | and $0xf, %ecx /* offset of rsi */ |
226 | and $0xf, %eax /* offset of rdi */ |
227 | pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ |
228 | cmp %eax, %ecx |
229 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ |
230 | ja LABEL(bigger) /* offset of rsi already the larger one */ |
231 | mov %edx, %r8d /* r8d is offset flag for exit tail; nonzero only on this swap path */ |
232 | xchg %ecx, %eax /* swap offsets and pointers so that %ecx/%rsi carry the larger offset */ |
233 | xchg %rsi, %rdi |
234 | LABEL(bigger): |
235 | movdqa (%rdi), %xmm2 |
236 | movdqa (%rsi), %xmm1 |
237 | lea 15(%rax), %r9 |
238 | sub %rcx, %r9 /* %r9 = 15 + rax - rcx, the "relative offset" used in the ashr_N case tables */ |
239 | lea LABEL(unaligned_table)(%rip), %r10 |
240 | movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit table entry, added to the table address below */ |
241 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
242 | lea (%r10, %r9), %r10 |
243 | jmp *%r10 /* jump to corresponding case */ |
244 | |
245 | /* |
246 | * The following cases will be handled by ashr_0 |
247 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
248 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 |
249 | */ |
250 | .p2align 4 |
251 | LABEL(ashr_0): |
252 | /* Both pointers have the same offset within their 16-byte blocks, so no shifting is needed. */ |
253 | movdqa (%rsi), %xmm1 |
254 | pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ |
255 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
256 | pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ |
257 | #else |
258 | movdqa (%rdi), %xmm2 |
259 | TOLOWER (%xmm1, %xmm2) |
260 | pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ |
261 | #endif |
262 | psubb %xmm0, D(%xmm1) /* packed sub of comparison results */ |
263 | pmovmskb %xmm1, %r9d |
264 | shr %cl, %edx /* adjust 0xffff for offset */ |
265 | shr %cl, %r9d /* adjust for 16-byte offset */ |
266 | sub %r9d, %edx |
267 | /* |
268 | * edx equals r9d (result zero) only if the bytes from offset rcx to the |
269 | * end of both 16-byte blocks are equal and no null char was seen. |
270 | */ |
271 | jne LABEL(less32bytes) /* mismatch or null char */ |
272 | UPDATE_STRNCMP_COUNTER |
273 | mov $16, %rcx |
274 | mov $16, %r9 |
275 | |
276 | /* |
277 | * Now both strings are aligned at 16-byte boundary. Loop over strings |
278 | * checking 32-bytes per iteration. |
279 | */ |
280 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
281 | .p2align 4 |
282 | LABEL(ashr_0_use): |
283 | movdqa (%rdi,%rdx), %xmm0 |
284 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
285 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
286 | #else |
287 | movdqa (%rsi,%rdx), %xmm1 |
288 | TOLOWER (%xmm0, %xmm1) |
289 | pcmpistri $0x1a, %xmm1, %xmm0 |
290 | #endif |
291 | lea 16(%rdx), %rdx /* advance the offset; lea leaves the pcmpistri flags untouched */ |
292 | jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
293 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
294 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
295 | jbe LABEL(strcmp_exitz) |
296 | #endif |
297 | /* Second half of the unrolled loop body: same steps for the next 16 bytes. */ |
298 | movdqa (%rdi,%rdx), %xmm0 |
299 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
300 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
301 | #else |
302 | movdqa (%rsi,%rdx), %xmm1 |
303 | TOLOWER (%xmm0, %xmm1) |
304 | pcmpistri $0x1a, %xmm1, %xmm0 |
305 | #endif |
306 | lea 16(%rdx), %rdx /* advance the offset; flags preserved */ |
307 | jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
308 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
309 | sub $16, %r11 |
310 | jbe LABEL(strcmp_exitz) |
311 | #endif |
312 | jmp LABEL(ashr_0_use) |
313 | |
314 | /* Loop exit: flags and %ecx come from the last pcmpistri; %rdx has already been advanced by 16. */ |
315 | .p2align 4 |
316 | LABEL(ashr_0_exit_use): |
317 | jnc LABEL(strcmp_exitz) /* CFlag == 0: case 2, both blocks identical up to EOS */ |
318 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
319 | sub %rcx, %r11 /* difference lies at or beyond the length limit? */ |
320 | jbe LABEL(strcmp_exitz) |
321 | #endif |
322 | lea -16(%rdx, %rcx), %rcx /* %rcx = offset of the first differing byte (undo the early +16 on %rdx) */ |
323 | movzbl (%rdi, %rcx), %eax |
324 | movzbl (%rsi, %rcx), %edx |
325 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
326 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table address offset by 128 four-byte entries */ |
327 | movl (%rcx,%rax,4), %eax /* map both bytes through the tolower table */ |
328 | movl (%rcx,%rdx,4), %edx |
329 | #endif |
330 | sub %edx, %eax /* return value = difference of the two bytes */ |
331 | ret |
332 | |
333 | |
334 | |
335 | /* |
336 | * The following cases will be handled by ashr_1 |
337 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
338 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 |
339 | */ |
340 | .p2align 4 |
341 | LABEL(ashr_1): |
342 | pslldq $15, D(%xmm2) /* shift first string to align with second */ |
343 | TOLOWER (%xmm1, %xmm2) |
344 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
345 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
346 | pmovmskb %xmm2, %r9d |
347 | shr %cl, %edx /* adjust 0xffff for offset */ |
348 | shr %cl, %r9d /* adjust for 16-byte offset */ |
349 | sub %r9d, %edx |
350 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
351 | movdqa (%rdi), %xmm3 |
352 | UPDATE_STRNCMP_COUNTER |
353 | |
354 | mov $16, %rcx /* index for loads */ |
355 | mov $1, %r9d /* byte position left over from less32bytes case */ |
356 | /* |
357 | * Setup %r10 value allows us to detect crossing a page boundary. |
358 | * When %r10 goes positive we have crossed a page boundary and |
359 | * need to do a nibble. |
360 | */ |
361 | lea 1(%rdi), %r10 |
362 | and $0xfff, %r10 /* offset into 4K page */ |
363 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
364 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
365 | |
366 | .p2align 4 |
367 | LABEL(loop_ashr_1_use): |
368 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
369 | jg LABEL(nibble_ashr_1_use) |
370 | |
371 | LABEL(nibble_ashr_1_restart_use): |
372 | movdqa (%rdi, %rdx), %xmm0 |
373 | palignr $1, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 15 bytes of the previous block + first 1 byte of this one */ |
374 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
375 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
376 | #else |
377 | movdqa (%rsi,%rdx), %xmm1 |
378 | TOLOWER (%xmm0, %xmm1) |
379 | pcmpistri $0x1a, %xmm1, %xmm0 |
380 | #endif |
381 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
382 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
383 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
384 | jbe LABEL(strcmp_exitz) |
385 | #endif |
386 | /* Second half of the unrolled loop body. */ |
387 | add $16, %rdx |
388 | add $16, %r10 |
389 | jg LABEL(nibble_ashr_1_use) |
390 | |
391 | movdqa (%rdi, %rdx), %xmm0 |
392 | palignr $1, -16(%rdi, %rdx), D(%xmm0) |
393 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
394 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
395 | #else |
396 | movdqa (%rsi,%rdx), %xmm1 |
397 | TOLOWER (%xmm0, %xmm1) |
398 | pcmpistri $0x1a, %xmm1, %xmm0 |
399 | #endif |
400 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
401 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
402 | sub $16, %r11 |
403 | jbe LABEL(strcmp_exitz) |
404 | #endif |
405 | add $16, %rdx |
406 | jmp LABEL(loop_ashr_1_use) |
407 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
408 | .p2align 4 |
409 | LABEL(nibble_ashr_1_use): |
410 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
411 | movdqa -16(%rdi, %rdx), %xmm0 |
412 | psrldq $1, D(%xmm0) /* keep the 15 bytes of the previous block not yet compared */ |
413 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
414 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
415 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
416 | jae LABEL(nibble_ashr_exit_use) |
417 | #endif |
418 | cmp $14, %ecx /* index above 14: no null in the 15 leftover bytes */ |
419 | ja LABEL(nibble_ashr_1_restart_use) |
420 | |
421 | jmp LABEL(nibble_ashr_exit_use) |
422 | |
423 | /* |
424 | * The following cases will be handled by ashr_2 |
425 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
426 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 |
427 | */ |
428 | .p2align 4 |
429 | LABEL(ashr_2): |
430 | pslldq $14, D(%xmm2) /* shift first string to align with second */ |
431 | TOLOWER (%xmm1, %xmm2) |
432 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
433 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
434 | pmovmskb %xmm2, %r9d |
435 | shr %cl, %edx /* adjust 0xffff for offset */ |
436 | shr %cl, %r9d /* adjust for 16-byte offset */ |
437 | sub %r9d, %edx |
438 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
439 | movdqa (%rdi), %xmm3 |
440 | UPDATE_STRNCMP_COUNTER |
441 | |
442 | mov $16, %rcx /* index for loads */ |
443 | mov $2, %r9d /* byte position left over from less32bytes case */ |
444 | /* |
445 | * Setup %r10 value allows us to detect crossing a page boundary. |
446 | * When %r10 goes positive we have crossed a page boundary and |
447 | * need to do a nibble. |
448 | */ |
449 | lea 2(%rdi), %r10 |
450 | and $0xfff, %r10 /* offset into 4K page */ |
451 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
452 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
453 | |
454 | .p2align 4 |
455 | LABEL(loop_ashr_2_use): |
456 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
457 | jg LABEL(nibble_ashr_2_use) |
458 | |
459 | LABEL(nibble_ashr_2_restart_use): |
460 | movdqa (%rdi, %rdx), %xmm0 |
461 | palignr $2, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 14 bytes of the previous block + first 2 bytes of this one */ |
462 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
463 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
464 | #else |
465 | movdqa (%rsi,%rdx), %xmm1 |
466 | TOLOWER (%xmm0, %xmm1) |
467 | pcmpistri $0x1a, %xmm1, %xmm0 |
468 | #endif |
469 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
470 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
471 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
472 | jbe LABEL(strcmp_exitz) |
473 | #endif |
474 | /* Second half of the unrolled loop body. */ |
475 | add $16, %rdx |
476 | add $16, %r10 |
477 | jg LABEL(nibble_ashr_2_use) |
478 | |
479 | movdqa (%rdi, %rdx), %xmm0 |
480 | palignr $2, -16(%rdi, %rdx), D(%xmm0) |
481 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
482 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
483 | #else |
484 | movdqa (%rsi,%rdx), %xmm1 |
485 | TOLOWER (%xmm0, %xmm1) |
486 | pcmpistri $0x1a, %xmm1, %xmm0 |
487 | #endif |
488 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
489 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
490 | sub $16, %r11 |
491 | jbe LABEL(strcmp_exitz) |
492 | #endif |
493 | add $16, %rdx |
494 | jmp LABEL(loop_ashr_2_use) |
495 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
496 | .p2align 4 |
497 | LABEL(nibble_ashr_2_use): |
498 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
499 | movdqa -16(%rdi, %rdx), %xmm0 |
500 | psrldq $2, D(%xmm0) /* keep the 14 bytes of the previous block not yet compared */ |
501 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
502 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
503 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
504 | jae LABEL(nibble_ashr_exit_use) |
505 | #endif |
506 | cmp $13, %ecx /* index above 13: no null in the 14 leftover bytes */ |
507 | ja LABEL(nibble_ashr_2_restart_use) |
508 | |
509 | jmp LABEL(nibble_ashr_exit_use) |
510 | |
511 | /* |
512 | * The following cases will be handled by ashr_3 |
513 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
514 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 |
515 | */ |
516 | .p2align 4 |
517 | LABEL(ashr_3): |
518 | pslldq $13, D(%xmm2) /* shift first string to align with second */ |
519 | TOLOWER (%xmm1, %xmm2) |
520 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
521 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
522 | pmovmskb %xmm2, %r9d |
523 | shr %cl, %edx /* adjust 0xffff for offset */ |
524 | shr %cl, %r9d /* adjust for 16-byte offset */ |
525 | sub %r9d, %edx |
526 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
527 | movdqa (%rdi), %xmm3 |
528 | |
529 | UPDATE_STRNCMP_COUNTER |
530 | |
531 | mov $16, %rcx /* index for loads */ |
532 | mov $3, %r9d /* byte position left over from less32bytes case */ |
533 | /* |
534 | * Setup %r10 value allows us to detect crossing a page boundary. |
535 | * When %r10 goes positive we have crossed a page boundary and |
536 | * need to do a nibble. |
537 | */ |
538 | lea 3(%rdi), %r10 |
539 | and $0xfff, %r10 /* offset into 4K page */ |
540 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
541 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
542 | /* NOTE(review): unlike the other loop_ashr_N_use labels, this one is not preceded by .p2align 4 -- confirm whether that is intentional. */ |
543 | LABEL(loop_ashr_3_use): |
544 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
545 | jg LABEL(nibble_ashr_3_use) |
546 | |
547 | LABEL(nibble_ashr_3_restart_use): |
548 | movdqa (%rdi, %rdx), %xmm0 |
549 | palignr $3, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 13 bytes of the previous block + first 3 bytes of this one */ |
550 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
551 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
552 | #else |
553 | movdqa (%rsi,%rdx), %xmm1 |
554 | TOLOWER (%xmm0, %xmm1) |
555 | pcmpistri $0x1a, %xmm1, %xmm0 |
556 | #endif |
557 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
558 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
559 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
560 | jbe LABEL(strcmp_exitz) |
561 | #endif |
562 | /* Second half of the unrolled loop body. */ |
563 | add $16, %rdx |
564 | add $16, %r10 |
565 | jg LABEL(nibble_ashr_3_use) |
566 | |
567 | movdqa (%rdi, %rdx), %xmm0 |
568 | palignr $3, -16(%rdi, %rdx), D(%xmm0) |
569 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
570 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
571 | #else |
572 | movdqa (%rsi,%rdx), %xmm1 |
573 | TOLOWER (%xmm0, %xmm1) |
574 | pcmpistri $0x1a, %xmm1, %xmm0 |
575 | #endif |
576 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
577 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
578 | sub $16, %r11 |
579 | jbe LABEL(strcmp_exitz) |
580 | #endif |
581 | add $16, %rdx |
582 | jmp LABEL(loop_ashr_3_use) |
583 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
584 | .p2align 4 |
585 | LABEL(nibble_ashr_3_use): |
586 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
587 | movdqa -16(%rdi, %rdx), %xmm0 |
588 | psrldq $3, D(%xmm0) /* keep the 13 bytes of the previous block not yet compared */ |
589 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
590 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
591 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
592 | jae LABEL(nibble_ashr_exit_use) |
593 | #endif |
594 | cmp $12, %ecx /* index above 12: no null in the 13 leftover bytes */ |
595 | ja LABEL(nibble_ashr_3_restart_use) |
596 | |
597 | jmp LABEL(nibble_ashr_exit_use) |
598 | |
599 | /* |
600 | * The following cases will be handled by ashr_4 |
601 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
602 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 |
603 | */ |
604 | .p2align 4 |
605 | LABEL(ashr_4): |
606 | pslldq $12, D(%xmm2) /* shift first string to align with second */ |
607 | TOLOWER (%xmm1, %xmm2) |
608 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
609 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
610 | pmovmskb %xmm2, %r9d |
611 | shr %cl, %edx /* adjust 0xffff for offset */ |
612 | shr %cl, %r9d /* adjust for 16-byte offset */ |
613 | sub %r9d, %edx |
614 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
615 | movdqa (%rdi), %xmm3 |
616 | |
617 | UPDATE_STRNCMP_COUNTER |
618 | |
619 | mov $16, %rcx /* index for loads */ |
620 | mov $4, %r9d /* byte position left over from less32bytes case */ |
621 | /* |
622 | * Setup %r10 value allows us to detect crossing a page boundary. |
623 | * When %r10 goes positive we have crossed a page boundary and |
624 | * need to do a nibble. |
625 | */ |
626 | lea 4(%rdi), %r10 |
627 | and $0xfff, %r10 /* offset into 4K page */ |
628 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
629 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
630 | |
631 | .p2align 4 |
632 | LABEL(loop_ashr_4_use): |
633 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
634 | jg LABEL(nibble_ashr_4_use) |
635 | |
636 | LABEL(nibble_ashr_4_restart_use): |
637 | movdqa (%rdi, %rdx), %xmm0 |
638 | palignr $4, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 12 bytes of the previous block + first 4 bytes of this one */ |
639 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
640 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
641 | #else |
642 | movdqa (%rsi,%rdx), %xmm1 |
643 | TOLOWER (%xmm0, %xmm1) |
644 | pcmpistri $0x1a, %xmm1, %xmm0 |
645 | #endif |
646 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
647 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
648 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
649 | jbe LABEL(strcmp_exitz) |
650 | #endif |
651 | /* Second half of the unrolled loop body. */ |
652 | add $16, %rdx |
653 | add $16, %r10 |
654 | jg LABEL(nibble_ashr_4_use) |
655 | |
656 | movdqa (%rdi, %rdx), %xmm0 |
657 | palignr $4, -16(%rdi, %rdx), D(%xmm0) |
658 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
659 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
660 | #else |
661 | movdqa (%rsi,%rdx), %xmm1 |
662 | TOLOWER (%xmm0, %xmm1) |
663 | pcmpistri $0x1a, %xmm1, %xmm0 |
664 | #endif |
665 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
666 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
667 | sub $16, %r11 |
668 | jbe LABEL(strcmp_exitz) |
669 | #endif |
670 | add $16, %rdx |
671 | jmp LABEL(loop_ashr_4_use) |
672 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
673 | .p2align 4 |
674 | LABEL(nibble_ashr_4_use): |
675 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
676 | movdqa -16(%rdi, %rdx), %xmm0 |
677 | psrldq $4, D(%xmm0) /* keep the 12 bytes of the previous block not yet compared */ |
678 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
679 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
680 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
681 | jae LABEL(nibble_ashr_exit_use) |
682 | #endif |
683 | cmp $11, %ecx /* index above 11: no null in the 12 leftover bytes */ |
684 | ja LABEL(nibble_ashr_4_restart_use) |
685 | |
686 | jmp LABEL(nibble_ashr_exit_use) |
687 | |
688 | /* |
689 | * The following cases will be handled by ashr_5 |
690 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
691 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 |
692 | */ |
693 | .p2align 4 |
694 | LABEL(ashr_5): |
695 | pslldq $11, D(%xmm2) /* shift first string to align with second */ |
696 | TOLOWER (%xmm1, %xmm2) |
697 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
698 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
699 | pmovmskb %xmm2, %r9d |
700 | shr %cl, %edx /* adjust 0xffff for offset */ |
701 | shr %cl, %r9d /* adjust for 16-byte offset */ |
702 | sub %r9d, %edx |
703 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
704 | movdqa (%rdi), %xmm3 |
705 | |
706 | UPDATE_STRNCMP_COUNTER |
707 | |
708 | mov $16, %rcx /* index for loads */ |
709 | mov $5, %r9d /* byte position left over from less32bytes case */ |
710 | /* |
711 | * Setup %r10 value allows us to detect crossing a page boundary. |
712 | * When %r10 goes positive we have crossed a page boundary and |
713 | * need to do a nibble. |
714 | */ |
715 | lea 5(%rdi), %r10 |
716 | and $0xfff, %r10 /* offset into 4K page */ |
717 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
718 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
719 | |
720 | .p2align 4 |
721 | LABEL(loop_ashr_5_use): |
722 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
723 | jg LABEL(nibble_ashr_5_use) |
724 | |
725 | LABEL(nibble_ashr_5_restart_use): |
726 | movdqa (%rdi, %rdx), %xmm0 |
727 | palignr $5, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 11 bytes of the previous block + first 5 bytes of this one */ |
728 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
729 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
730 | #else |
731 | movdqa (%rsi,%rdx), %xmm1 |
732 | TOLOWER (%xmm0, %xmm1) |
733 | pcmpistri $0x1a, %xmm1, %xmm0 |
734 | #endif |
735 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
736 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
737 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
738 | jbe LABEL(strcmp_exitz) |
739 | #endif |
740 | /* Second half of the unrolled loop body. */ |
741 | add $16, %rdx |
742 | add $16, %r10 |
743 | jg LABEL(nibble_ashr_5_use) |
744 | |
745 | movdqa (%rdi, %rdx), %xmm0 |
746 | |
747 | palignr $5, -16(%rdi, %rdx), D(%xmm0) |
748 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
749 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
750 | #else |
751 | movdqa (%rsi,%rdx), %xmm1 |
752 | TOLOWER (%xmm0, %xmm1) |
753 | pcmpistri $0x1a, %xmm1, %xmm0 |
754 | #endif |
755 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
756 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
757 | sub $16, %r11 |
758 | jbe LABEL(strcmp_exitz) |
759 | #endif |
760 | add $16, %rdx |
761 | jmp LABEL(loop_ashr_5_use) |
762 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
763 | .p2align 4 |
764 | LABEL(nibble_ashr_5_use): |
765 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
766 | movdqa -16(%rdi, %rdx), %xmm0 |
767 | psrldq $5, D(%xmm0) /* keep the 11 bytes of the previous block not yet compared */ |
768 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
769 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
770 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
771 | jae LABEL(nibble_ashr_exit_use) |
772 | #endif |
773 | cmp $10, %ecx /* index above 10: no null in the 11 leftover bytes */ |
774 | ja LABEL(nibble_ashr_5_restart_use) |
775 | |
776 | jmp LABEL(nibble_ashr_exit_use) |
777 | |
778 | /* |
779 | * The following cases will be handled by ashr_6 |
780 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
781 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 |
782 | */ |
783 | .p2align 4 |
784 | LABEL(ashr_6): |
785 | pslldq $10, D(%xmm2) /* shift first string to align with second */ |
786 | TOLOWER (%xmm1, %xmm2) |
787 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
788 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
789 | pmovmskb %xmm2, %r9d |
790 | shr %cl, %edx /* adjust 0xffff for offset */ |
791 | shr %cl, %r9d /* adjust for 16-byte offset */ |
792 | sub %r9d, %edx |
793 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
794 | movdqa (%rdi), %xmm3 |
795 | |
796 | UPDATE_STRNCMP_COUNTER |
797 | |
798 | mov $16, %rcx /* index for loads */ |
799 | mov $6, %r9d /* byte position left over from less32bytes case */ |
800 | /* |
801 | * Setup %r10 value allows us to detect crossing a page boundary. |
802 | * When %r10 goes positive we have crossed a page boundary and |
803 | * need to do a nibble. |
804 | */ |
805 | lea 6(%rdi), %r10 |
806 | and $0xfff, %r10 /* offset into 4K page */ |
807 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
808 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
809 | |
810 | .p2align 4 |
811 | LABEL(loop_ashr_6_use): |
812 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
813 | jg LABEL(nibble_ashr_6_use) |
814 | |
815 | LABEL(nibble_ashr_6_restart_use): |
816 | movdqa (%rdi, %rdx), %xmm0 |
817 | palignr $6, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 10 bytes of the previous block + first 6 bytes of this one */ |
818 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
819 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
820 | #else |
821 | movdqa (%rsi,%rdx), %xmm1 |
822 | TOLOWER (%xmm0, %xmm1) |
823 | pcmpistri $0x1a, %xmm1, %xmm0 |
824 | #endif |
825 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
826 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
827 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
828 | jbe LABEL(strcmp_exitz) |
829 | #endif |
830 | /* Second half of the unrolled loop body. */ |
831 | add $16, %rdx |
832 | add $16, %r10 |
833 | jg LABEL(nibble_ashr_6_use) |
834 | |
835 | movdqa (%rdi, %rdx), %xmm0 |
836 | palignr $6, -16(%rdi, %rdx), D(%xmm0) |
837 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
838 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
839 | #else |
840 | movdqa (%rsi,%rdx), %xmm1 |
841 | TOLOWER (%xmm0, %xmm1) |
842 | pcmpistri $0x1a, %xmm1, %xmm0 |
843 | #endif |
844 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
845 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
846 | sub $16, %r11 |
847 | jbe LABEL(strcmp_exitz) |
848 | #endif |
849 | add $16, %rdx |
850 | jmp LABEL(loop_ashr_6_use) |
851 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
852 | .p2align 4 |
853 | LABEL(nibble_ashr_6_use): |
854 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
855 | movdqa -16(%rdi, %rdx), %xmm0 |
856 | psrldq $6, D(%xmm0) /* keep the 10 bytes of the previous block not yet compared */ |
857 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
858 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
859 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
860 | jae LABEL(nibble_ashr_exit_use) |
861 | #endif |
862 | cmp $9, %ecx /* index above 9: no null in the 10 leftover bytes */ |
863 | ja LABEL(nibble_ashr_6_restart_use) |
864 | |
865 | jmp LABEL(nibble_ashr_exit_use) |
866 | |
867 | /* |
868 | * The following cases will be handled by ashr_7 |
869 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
870 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 |
871 | */ |
872 | .p2align 4 |
873 | LABEL(ashr_7): |
874 | pslldq $9, D(%xmm2) /* shift first string to align with second */ |
875 | TOLOWER (%xmm1, %xmm2) |
876 | pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ |
877 | psubb %xmm0, D(%xmm2) /* packed sub of comparison results */ |
878 | pmovmskb %xmm2, %r9d |
879 | shr %cl, %edx /* adjust 0xffff for offset */ |
880 | shr %cl, %r9d /* adjust for 16-byte offset */ |
881 | sub %r9d, %edx |
882 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
883 | movdqa (%rdi), %xmm3 |
884 | |
885 | UPDATE_STRNCMP_COUNTER |
886 | |
887 | mov $16, %rcx /* index for loads */ |
888 | mov $7, %r9d /* byte position left over from less32bytes case */ |
889 | /* |
890 | * Setup %r10 value allows us to detect crossing a page boundary. |
891 | * When %r10 goes positive we have crossed a page boundary and |
892 | * need to do a nibble. |
893 | */ |
894 | lea 7(%rdi), %r10 |
895 | and $0xfff, %r10 /* offset into 4K page */ |
896 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
897 | mov %rcx, %rdx /* only for offset of sse4 instruction loop */ |
898 | |
899 | .p2align 4 |
900 | LABEL(loop_ashr_7_use): |
901 | add $16, %r10 /* positive: the next block of (%rdi) lies in a new page */ |
902 | jg LABEL(nibble_ashr_7_use) |
903 | |
904 | LABEL(nibble_ashr_7_restart_use): |
905 | movdqa (%rdi, %rdx), %xmm0 |
906 | palignr $7, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = last 9 bytes of the previous block + first 7 bytes of this one */ |
907 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
908 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
909 | #else |
910 | movdqa (%rsi,%rdx), %xmm1 |
911 | TOLOWER (%xmm0, %xmm1) |
912 | pcmpistri $0x1a, %xmm1, %xmm0 |
913 | #endif |
914 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
915 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
916 | sub $16, %r11 /* 16 more bytes matched; stop once the length limit is used up */ |
917 | jbe LABEL(strcmp_exitz) |
918 | #endif |
919 | /* Second half of the unrolled loop body. */ |
920 | add $16, %rdx |
921 | add $16, %r10 |
922 | jg LABEL(nibble_ashr_7_use) |
923 | |
924 | movdqa (%rdi, %rdx), %xmm0 |
925 | palignr $7, -16(%rdi, %rdx), D(%xmm0) |
926 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
927 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
928 | #else |
929 | movdqa (%rsi,%rdx), %xmm1 |
930 | TOLOWER (%xmm0, %xmm1) |
931 | pcmpistri $0x1a, %xmm1, %xmm0 |
932 | #endif |
933 | jbe LABEL(exit_use) /* CFlag or ZFlag set: cases 2, 3, 4 */ |
934 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
935 | sub $16, %r11 |
936 | jbe LABEL(strcmp_exitz) |
937 | #endif |
938 | add $16, %rdx |
939 | jmp LABEL(loop_ashr_7_use) |
940 | /* Page-boundary check: look for a null in the bytes already loaded before reading the next page. */ |
941 | .p2align 4 |
942 | LABEL(nibble_ashr_7_use): |
943 | sub $0x1000, %r10 /* re-bias %r10 by one 4K page */ |
944 | movdqa -16(%rdi, %rdx), %xmm0 |
945 | psrldq $7, D(%xmm0) /* keep the 9 bytes of the previous block not yet compared */ |
946 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte among them */ |
947 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
948 | cmp %r11, %rcx /* length limit ends at or before that null? */ |
949 | jae LABEL(nibble_ashr_exit_use) |
950 | #endif |
951 | cmp $8, %ecx /* index above 8: no null in the 9 leftover bytes */ |
952 | ja LABEL(nibble_ashr_7_restart_use) |
953 | |
954 | jmp LABEL(nibble_ashr_exit_use) |
955 | |
956 | /* |
957 | * The following cases will be handled by ashr_8 (slot 7, counting from 0, of unaligned_table) |
958 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
959 | * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 |
960 | */ |
961 | .p2align 4 |
962 | LABEL(ashr_8): |
963 | pslldq $8, D(%xmm2) /* shift %xmm2 left by 8 bytes (16 - 8), zero-filled, ahead of the compare with %xmm1 */ |
964 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
965 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
966 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
967 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
968 | shr %cl, %edx /* shift both masks right by %cl */ |
969 | shr %cl, %r9d |
970 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
971 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
972 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
973 | |
974 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
975 | |
976 | mov $16, %rcx /* index for loads */ |
977 | mov $8, %r9d /* byte position left over from less32bytes case */ |
978 | /* |
979 | * Set up %r10 so that crossing a page boundary can be detected: |
980 | * %r10 = ((%rdi + 8) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
981 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
982 | */ |
983 | lea 8(%rdi), %r10 |
984 | and $0xfff, %r10 /* offset into 4K page */ |
985 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
986 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
987 | |
988 | .p2align 4 |
989 | LABEL(loop_ashr_8_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
990 | add $16, %r10 |
991 | jg LABEL(nibble_ashr_8_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
992 | |
993 | LABEL(nibble_ashr_8_restart_use): /* re-entered from the nibble code when the string continues */ |
994 | movdqa (%rdi, %rdx), %xmm0 |
995 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 8 */ |
996 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
997 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
998 | #else |
999 | movdqa (%rsi,%rdx), %xmm1 |
1000 | TOLOWER (%xmm0, %xmm1) |
1001 | pcmpistri $0x1a, %xmm1, %xmm0 |
1002 | #endif |
1003 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1004 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1005 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1006 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1007 | #endif |
1008 | |
1009 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1010 | add $16, %r10 |
1011 | jg LABEL(nibble_ashr_8_use) /* same page check for the second unrolled step */ |
1012 | |
1013 | movdqa (%rdi, %rdx), %xmm0 |
1014 | palignr $8, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1015 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1016 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1017 | #else |
1018 | movdqa (%rsi,%rdx), %xmm1 |
1019 | TOLOWER (%xmm0, %xmm1) |
1020 | pcmpistri $0x1a, %xmm1, %xmm0 |
1021 | #endif |
1022 | jbe LABEL(exit_use) |
1023 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1024 | sub $16, %r11 |
1025 | jbe LABEL(strcmp_exitz) |
1026 | #endif |
1027 | add $16, %rdx |
1028 | jmp LABEL(loop_ashr_8_use) |
1029 | |
1030 | .p2align 4 |
1031 | LABEL(nibble_ashr_8_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1032 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1033 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1034 | psrldq $8, D(%xmm0) /* keep its upper 8 bytes in the low lanes, zero above */ |
1035 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1036 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1037 | cmp %r11, %rcx |
1038 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1039 | #endif |
1040 | cmp $7, %ecx |
1041 | ja LABEL(nibble_ashr_8_restart_use) /* %ecx > 7: none of the 8 leftover bytes is NUL, go on with the next chunk */ |
1042 | |
1043 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1044 | |
1045 | /* |
1046 | * The following cases will be handled by ashr_9 (slot 8, counting from 0, of unaligned_table) |
1047 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1048 | * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 |
1049 | */ |
1050 | .p2align 4 |
1051 | LABEL(ashr_9): |
1052 | pslldq $7, D(%xmm2) /* shift %xmm2 left by 7 bytes (16 - 9), zero-filled, ahead of the compare with %xmm1 */ |
1053 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1054 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1055 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1056 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1057 | shr %cl, %edx /* shift both masks right by %cl */ |
1058 | shr %cl, %r9d |
1059 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1060 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1061 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1062 | |
1063 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1064 | |
1065 | mov $16, %rcx /* index for loads */ |
1066 | mov $9, %r9d /* byte position left over from less32bytes case */ |
1067 | /* |
1068 | * Set up %r10 so that crossing a page boundary can be detected: |
1069 | * %r10 = ((%rdi + 9) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1070 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1071 | */ |
1072 | lea 9(%rdi), %r10 |
1073 | and $0xfff, %r10 /* offset into 4K page */ |
1074 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1075 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1076 | |
1077 | .p2align 4 |
1078 | LABEL(loop_ashr_9_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1079 | add $16, %r10 |
1080 | jg LABEL(nibble_ashr_9_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1081 | |
1082 | LABEL(nibble_ashr_9_restart_use): /* re-entered from the nibble code when the string continues */ |
1083 | movdqa (%rdi, %rdx), %xmm0 |
1084 | |
1085 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 9 */ |
1086 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1087 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1088 | #else |
1089 | movdqa (%rsi,%rdx), %xmm1 |
1090 | TOLOWER (%xmm0, %xmm1) |
1091 | pcmpistri $0x1a, %xmm1, %xmm0 |
1092 | #endif |
1093 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1094 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1095 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1096 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1097 | #endif |
1098 | |
1099 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1100 | add $16, %r10 |
1101 | jg LABEL(nibble_ashr_9_use) /* same page check for the second unrolled step */ |
1102 | |
1103 | movdqa (%rdi, %rdx), %xmm0 |
1104 | palignr $9, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1105 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1106 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1107 | #else |
1108 | movdqa (%rsi,%rdx), %xmm1 |
1109 | TOLOWER (%xmm0, %xmm1) |
1110 | pcmpistri $0x1a, %xmm1, %xmm0 |
1111 | #endif |
1112 | jbe LABEL(exit_use) |
1113 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1114 | sub $16, %r11 |
1115 | jbe LABEL(strcmp_exitz) |
1116 | #endif |
1117 | add $16, %rdx |
1118 | jmp LABEL(loop_ashr_9_use) |
1119 | |
1120 | .p2align 4 |
1121 | LABEL(nibble_ashr_9_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1122 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1123 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1124 | psrldq $9, D(%xmm0) /* keep its upper 7 bytes in the low lanes, zero above */ |
1125 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1126 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1127 | cmp %r11, %rcx |
1128 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1129 | #endif |
1130 | cmp $6, %ecx |
1131 | ja LABEL(nibble_ashr_9_restart_use) /* %ecx > 6: none of the 7 leftover bytes is NUL, go on with the next chunk */ |
1132 | |
1133 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1134 | |
1135 | /* |
1136 | * The following cases will be handled by ashr_10 (slot 9, counting from 0, of unaligned_table) |
1137 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1138 | * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 |
1139 | */ |
1140 | .p2align 4 |
1141 | LABEL(ashr_10): |
1142 | pslldq $6, D(%xmm2) /* shift %xmm2 left by 6 bytes (16 - 10), zero-filled, ahead of the compare with %xmm1 */ |
1143 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1144 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1145 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1146 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1147 | shr %cl, %edx /* shift both masks right by %cl */ |
1148 | shr %cl, %r9d |
1149 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1150 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1151 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1152 | |
1153 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1154 | |
1155 | mov $16, %rcx /* index for loads */ |
1156 | mov $10, %r9d /* byte position left over from less32bytes case */ |
1157 | /* |
1158 | * Set up %r10 so that crossing a page boundary can be detected: |
1159 | * %r10 = ((%rdi + 10) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1160 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1161 | */ |
1162 | lea 10(%rdi), %r10 |
1163 | and $0xfff, %r10 /* offset into 4K page */ |
1164 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1165 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1166 | |
1167 | .p2align 4 |
1168 | LABEL(loop_ashr_10_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1169 | add $16, %r10 |
1170 | jg LABEL(nibble_ashr_10_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1171 | |
1172 | LABEL(nibble_ashr_10_restart_use): /* re-entered from the nibble code when the string continues */ |
1173 | movdqa (%rdi, %rdx), %xmm0 |
1174 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 10 */ |
1175 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1176 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1177 | #else |
1178 | movdqa (%rsi,%rdx), %xmm1 |
1179 | TOLOWER (%xmm0, %xmm1) |
1180 | pcmpistri $0x1a, %xmm1, %xmm0 |
1181 | #endif |
1182 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1183 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1184 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1185 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1186 | #endif |
1187 | |
1188 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1189 | add $16, %r10 |
1190 | jg LABEL(nibble_ashr_10_use) /* same page check for the second unrolled step */ |
1191 | |
1192 | movdqa (%rdi, %rdx), %xmm0 |
1193 | palignr $10, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1194 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1195 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1196 | #else |
1197 | movdqa (%rsi,%rdx), %xmm1 |
1198 | TOLOWER (%xmm0, %xmm1) |
1199 | pcmpistri $0x1a, %xmm1, %xmm0 |
1200 | #endif |
1201 | jbe LABEL(exit_use) |
1202 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1203 | sub $16, %r11 |
1204 | jbe LABEL(strcmp_exitz) |
1205 | #endif |
1206 | add $16, %rdx |
1207 | jmp LABEL(loop_ashr_10_use) |
1208 | |
1209 | .p2align 4 |
1210 | LABEL(nibble_ashr_10_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1211 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1212 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1213 | psrldq $10, D(%xmm0) /* keep its upper 6 bytes in the low lanes, zero above */ |
1214 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1215 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1216 | cmp %r11, %rcx |
1217 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1218 | #endif |
1219 | cmp $5, %ecx |
1220 | ja LABEL(nibble_ashr_10_restart_use) /* %ecx > 5: none of the 6 leftover bytes is NUL, go on with the next chunk */ |
1221 | |
1222 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1223 | |
1224 | /* |
1225 | * The following cases will be handled by ashr_11 (slot 10, counting from 0, of unaligned_table) |
1226 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1227 | * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 |
1228 | */ |
1229 | .p2align 4 |
1230 | LABEL(ashr_11): |
1231 | pslldq $5, D(%xmm2) /* shift %xmm2 left by 5 bytes (16 - 11), zero-filled, ahead of the compare with %xmm1 */ |
1232 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1233 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1234 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1235 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1236 | shr %cl, %edx /* shift both masks right by %cl */ |
1237 | shr %cl, %r9d |
1238 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1239 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1240 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1241 | |
1242 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1243 | |
1244 | mov $16, %rcx /* index for loads */ |
1245 | mov $11, %r9d /* byte position left over from less32bytes case */ |
1246 | /* |
1247 | * Set up %r10 so that crossing a page boundary can be detected: |
1248 | * %r10 = ((%rdi + 11) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1249 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1250 | */ |
1251 | lea 11(%rdi), %r10 |
1252 | and $0xfff, %r10 /* offset into 4K page */ |
1253 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1254 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1255 | |
1256 | .p2align 4 |
1257 | LABEL(loop_ashr_11_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1258 | add $16, %r10 |
1259 | jg LABEL(nibble_ashr_11_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1260 | |
1261 | LABEL(nibble_ashr_11_restart_use): /* re-entered from the nibble code when the string continues */ |
1262 | movdqa (%rdi, %rdx), %xmm0 |
1263 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 11 */ |
1264 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1265 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1266 | #else |
1267 | movdqa (%rsi,%rdx), %xmm1 |
1268 | TOLOWER (%xmm0, %xmm1) |
1269 | pcmpistri $0x1a, %xmm1, %xmm0 |
1270 | #endif |
1271 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1272 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1273 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1274 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1275 | #endif |
1276 | |
1277 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1278 | add $16, %r10 |
1279 | jg LABEL(nibble_ashr_11_use) /* same page check for the second unrolled step */ |
1280 | |
1281 | movdqa (%rdi, %rdx), %xmm0 |
1282 | palignr $11, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1283 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1284 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1285 | #else |
1286 | movdqa (%rsi,%rdx), %xmm1 |
1287 | TOLOWER (%xmm0, %xmm1) |
1288 | pcmpistri $0x1a, %xmm1, %xmm0 |
1289 | #endif |
1290 | jbe LABEL(exit_use) |
1291 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1292 | sub $16, %r11 |
1293 | jbe LABEL(strcmp_exitz) |
1294 | #endif |
1295 | add $16, %rdx |
1296 | jmp LABEL(loop_ashr_11_use) |
1297 | |
1298 | .p2align 4 |
1299 | LABEL(nibble_ashr_11_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1300 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1301 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1302 | psrldq $11, D(%xmm0) /* keep its upper 5 bytes in the low lanes, zero above */ |
1303 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1304 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1305 | cmp %r11, %rcx |
1306 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1307 | #endif |
1308 | cmp $4, %ecx |
1309 | ja LABEL(nibble_ashr_11_restart_use) /* %ecx > 4: none of the 5 leftover bytes is NUL, go on with the next chunk */ |
1310 | |
1311 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1312 | |
1313 | /* |
1314 | * The following cases will be handled by ashr_12 (slot 11, counting from 0, of unaligned_table) |
1315 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1316 | * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 |
1317 | */ |
1318 | .p2align 4 |
1319 | LABEL(ashr_12): |
1320 | pslldq $4, D(%xmm2) /* shift %xmm2 left by 4 bytes (16 - 12), zero-filled, ahead of the compare with %xmm1 */ |
1321 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1322 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1323 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1324 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1325 | shr %cl, %edx /* shift both masks right by %cl */ |
1326 | shr %cl, %r9d |
1327 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1328 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1329 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1330 | |
1331 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1332 | |
1333 | mov $16, %rcx /* index for loads */ |
1334 | mov $12, %r9d /* byte position left over from less32bytes case */ |
1335 | /* |
1336 | * Set up %r10 so that crossing a page boundary can be detected: |
1337 | * %r10 = ((%rdi + 12) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1338 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1339 | */ |
1340 | lea 12(%rdi), %r10 |
1341 | and $0xfff, %r10 /* offset into 4K page */ |
1342 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1343 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1344 | |
1345 | .p2align 4 |
1346 | LABEL(loop_ashr_12_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1347 | add $16, %r10 |
1348 | jg LABEL(nibble_ashr_12_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1349 | |
1350 | LABEL(nibble_ashr_12_restart_use): /* re-entered from the nibble code when the string continues */ |
1351 | movdqa (%rdi, %rdx), %xmm0 |
1352 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 12 */ |
1353 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1354 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1355 | #else |
1356 | movdqa (%rsi,%rdx), %xmm1 |
1357 | TOLOWER (%xmm0, %xmm1) |
1358 | pcmpistri $0x1a, %xmm1, %xmm0 |
1359 | #endif |
1360 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1361 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1362 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1363 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1364 | #endif |
1365 | |
1366 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1367 | add $16, %r10 |
1368 | jg LABEL(nibble_ashr_12_use) /* same page check for the second unrolled step */ |
1369 | |
1370 | movdqa (%rdi, %rdx), %xmm0 |
1371 | palignr $12, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1372 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1373 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1374 | #else |
1375 | movdqa (%rsi,%rdx), %xmm1 |
1376 | TOLOWER (%xmm0, %xmm1) |
1377 | pcmpistri $0x1a, %xmm1, %xmm0 |
1378 | #endif |
1379 | jbe LABEL(exit_use) |
1380 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1381 | sub $16, %r11 |
1382 | jbe LABEL(strcmp_exitz) |
1383 | #endif |
1384 | add $16, %rdx |
1385 | jmp LABEL(loop_ashr_12_use) |
1386 | |
1387 | .p2align 4 |
1388 | LABEL(nibble_ashr_12_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1389 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1390 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1391 | psrldq $12, D(%xmm0) /* keep its upper 4 bytes in the low lanes, zero above */ |
1392 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1393 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1394 | cmp %r11, %rcx |
1395 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1396 | #endif |
1397 | cmp $3, %ecx |
1398 | ja LABEL(nibble_ashr_12_restart_use) /* %ecx > 3: none of the 4 leftover bytes is NUL, go on with the next chunk */ |
1399 | |
1400 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1401 | |
1402 | /* |
1403 | * The following cases will be handled by ashr_13 (slot 12, counting from 0, of unaligned_table) |
1404 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1405 | * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 |
1406 | */ |
1407 | .p2align 4 |
1408 | LABEL(ashr_13): |
1409 | pslldq $3, D(%xmm2) /* shift %xmm2 left by 3 bytes (16 - 13), zero-filled, ahead of the compare with %xmm1 */ |
1410 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1411 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1412 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1413 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1414 | shr %cl, %edx /* shift both masks right by %cl */ |
1415 | shr %cl, %r9d |
1416 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1417 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1418 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1419 | |
1420 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1421 | |
1422 | mov $16, %rcx /* index for loads */ |
1423 | mov $13, %r9d /* byte position left over from less32bytes case */ |
1424 | /* |
1425 | * Set up %r10 so that crossing a page boundary can be detected: |
1426 | * %r10 = ((%rdi + 13) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1427 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1428 | */ |
1429 | lea 13(%rdi), %r10 |
1430 | and $0xfff, %r10 /* offset into 4K page */ |
1431 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1432 | |
1433 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1434 | |
1435 | .p2align 4 |
1436 | LABEL(loop_ashr_13_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1437 | add $16, %r10 |
1438 | jg LABEL(nibble_ashr_13_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1439 | |
1440 | LABEL(nibble_ashr_13_restart_use): /* re-entered from the nibble code when the string continues */ |
1441 | movdqa (%rdi, %rdx), %xmm0 |
1442 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 13 */ |
1443 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1444 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1445 | #else |
1446 | movdqa (%rsi,%rdx), %xmm1 |
1447 | TOLOWER (%xmm0, %xmm1) |
1448 | pcmpistri $0x1a, %xmm1, %xmm0 |
1449 | #endif |
1450 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1451 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1452 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1453 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1454 | #endif |
1455 | |
1456 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1457 | add $16, %r10 |
1458 | jg LABEL(nibble_ashr_13_use) /* same page check for the second unrolled step */ |
1459 | |
1460 | movdqa (%rdi, %rdx), %xmm0 |
1461 | palignr $13, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1462 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1463 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1464 | #else |
1465 | movdqa (%rsi,%rdx), %xmm1 |
1466 | TOLOWER (%xmm0, %xmm1) |
1467 | pcmpistri $0x1a, %xmm1, %xmm0 |
1468 | #endif |
1469 | jbe LABEL(exit_use) |
1470 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1471 | sub $16, %r11 |
1472 | jbe LABEL(strcmp_exitz) |
1473 | #endif |
1474 | add $16, %rdx |
1475 | jmp LABEL(loop_ashr_13_use) |
1476 | |
1477 | .p2align 4 |
1478 | LABEL(nibble_ashr_13_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1479 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1480 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1481 | psrldq $13, D(%xmm0) /* keep its upper 3 bytes in the low lanes, zero above */ |
1482 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1483 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1484 | cmp %r11, %rcx |
1485 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1486 | #endif |
1487 | cmp $2, %ecx |
1488 | ja LABEL(nibble_ashr_13_restart_use) /* %ecx > 2: none of the 3 leftover bytes is NUL, go on with the next chunk */ |
1489 | |
1490 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1491 | |
1492 | /* |
1493 | * The following cases will be handled by ashr_14 (slot 13, counting from 0, of unaligned_table) |
1494 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1495 | * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 |
1496 | */ |
1497 | .p2align 4 |
1498 | LABEL(ashr_14): |
1499 | pslldq $2, D(%xmm2) /* shift %xmm2 left by 2 bytes (16 - 14), zero-filled, ahead of the compare with %xmm1 */ |
1500 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1501 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1502 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1503 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1504 | shr %cl, %edx /* shift both masks right by %cl */ |
1505 | shr %cl, %r9d |
1506 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1507 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1508 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1509 | |
1510 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1511 | |
1512 | mov $16, %rcx /* index for loads */ |
1513 | mov $14, %r9d /* byte position left over from less32bytes case */ |
1514 | /* |
1515 | * Set up %r10 so that crossing a page boundary can be detected: |
1516 | * %r10 = ((%rdi + 14) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1517 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1518 | */ |
1519 | lea 14(%rdi), %r10 |
1520 | and $0xfff, %r10 /* offset into 4K page */ |
1521 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1522 | |
1523 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1524 | |
1525 | .p2align 4 |
1526 | LABEL(loop_ashr_14_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1527 | add $16, %r10 |
1528 | jg LABEL(nibble_ashr_14_use) /* signed > 0: page boundary reached, inspect the leftover bytes first */ |
1529 | |
1530 | LABEL(nibble_ashr_14_restart_use): /* re-entered from the nibble code when the string continues */ |
1531 | movdqa (%rdi, %rdx), %xmm0 |
1532 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 14 */ |
1533 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1534 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1535 | #else |
1536 | movdqa (%rsi,%rdx), %xmm1 |
1537 | TOLOWER (%xmm0, %xmm1) |
1538 | pcmpistri $0x1a, %xmm1, %xmm0 |
1539 | #endif |
1540 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1541 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1542 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1543 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1544 | #endif |
1545 | |
1546 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1547 | add $16, %r10 |
1548 | jg LABEL(nibble_ashr_14_use) /* same page check for the second unrolled step */ |
1549 | |
1550 | movdqa (%rdi, %rdx), %xmm0 |
1551 | palignr $14, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1552 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1553 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1554 | #else |
1555 | movdqa (%rsi,%rdx), %xmm1 |
1556 | TOLOWER (%xmm0, %xmm1) |
1557 | pcmpistri $0x1a, %xmm1, %xmm0 |
1558 | #endif |
1559 | jbe LABEL(exit_use) |
1560 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1561 | sub $16, %r11 |
1562 | jbe LABEL(strcmp_exitz) |
1563 | #endif |
1564 | add $16, %rdx |
1565 | jmp LABEL(loop_ashr_14_use) |
1566 | |
1567 | .p2align 4 |
1568 | LABEL(nibble_ashr_14_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1569 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1570 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1571 | psrldq $14, D(%xmm0) /* keep its upper 2 bytes in the low lanes, zero above */ |
1572 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1573 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1574 | cmp %r11, %rcx |
1575 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on these bytes */ |
1576 | #endif |
1577 | cmp $1, %ecx |
1578 | ja LABEL(nibble_ashr_14_restart_use) /* %ecx > 1: neither of the 2 leftover bytes is NUL, go on with the next chunk */ |
1579 | |
1580 | jmp LABEL(nibble_ashr_exit_use) /* a NUL is among the leftover bytes: compare them and exit */ |
1581 | |
1582 | /* |
1583 | * The following cases will be handled by ashr_15 (slot 14, counting from 0, of unaligned_table) |
1584 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1585 | * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 |
1586 | */ |
1587 | .p2align 4 |
1588 | LABEL(ashr_15): |
1589 | pslldq $1, D(%xmm2) /* shift %xmm2 left by 1 byte (16 - 15), zero-filled, ahead of the compare with %xmm1 */ |
1590 | TOLOWER (%xmm1, %xmm2) /* macro from earlier in the file; presumably case-folds only in the str(n)casecmp_l builds -- TODO confirm */ |
1591 | pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte lane where %xmm1 and %xmm2 agree */ |
1592 | psubb %xmm0, D(%xmm2) /* subtract %xmm0 as set before this entry (presumably its NUL-byte mask -- TODO confirm) */ |
1593 | pmovmskb %xmm2, %r9d /* top bit of each byte lane -> mask in %r9d */ |
1594 | shr %cl, %edx /* shift both masks right by %cl */ |
1595 | shr %cl, %r9d |
1596 | sub %r9d, %edx /* %edx is zero only if the two shifted masks are equal */ |
1597 | jnz LABEL(less32bytes) /* masks differ: less32bytes picks the byte from the lowest set bit of %rdx */ |
1598 | |
1599 | movdqa (%rdi), %xmm3 /* aligned 16-byte load; %xmm3 is not read again in this block */ |
1600 | |
1601 | UPDATE_STRNCMP_COUNTER /* macro from earlier in the file; presumably adjusts the strncmp count in %r11 -- TODO confirm */ |
1602 | |
1603 | mov $16, %rcx /* index for loads */ |
1604 | mov $15, %r9d /* byte position left over from less32bytes case */ |
1605 | /* |
1606 | * Set up %r10 so that crossing a page boundary can be detected: |
1607 | * %r10 = ((%rdi + 15) & 0xfff) - 0x1000 is negative and grows by 16 per chunk. |
1608 | * When %r10 goes positive we have crossed a page boundary and need to do a nibble. |
1609 | */ |
1610 | lea 15(%rdi), %r10 |
1611 | and $0xfff, %r10 /* offset into 4K page */ |
1612 | |
1613 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
1614 | |
1615 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
1616 | |
1617 | .p2align 4 |
1618 | LABEL(loop_ashr_15_use): /* main loop, unrolled twice; %rdx indexes both (%rdi) and (%rsi) */ |
1619 | add $16, %r10 |
1620 | jg LABEL(nibble_ashr_15_use) /* signed > 0: page boundary reached, inspect the leftover byte first */ |
1621 | |
1622 | LABEL(nibble_ashr_15_restart_use): /* re-entered from the nibble code when the string continues */ |
1623 | movdqa (%rdi, %rdx), %xmm0 |
1624 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* %xmm0 = the 16 bytes at address %rdi + %rdx - 16 + 15 */ |
1625 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1626 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* mode 0x1a is explained in the comment at the top of the file */ |
1627 | #else |
1628 | movdqa (%rsi,%rdx), %xmm1 |
1629 | TOLOWER (%xmm0, %xmm1) |
1630 | pcmpistri $0x1a, %xmm1, %xmm0 |
1631 | #endif |
1632 | jbe LABEL(exit_use) /* CF or ZF set: a difference or the end of a string is in this chunk */ |
1633 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1634 | sub $16, %r11 /* 16 more bytes charged against the count kept in %r11 */ |
1635 | jbe LABEL(strcmp_exitz) /* count used up without a difference: result is 0 */ |
1636 | #endif |
1637 | |
1638 | add $16, %rdx /* advance to the next 16-byte chunk */ |
1639 | add $16, %r10 |
1640 | jg LABEL(nibble_ashr_15_use) /* same page check for the second unrolled step */ |
1641 | |
1642 | movdqa (%rdi, %rdx), %xmm0 |
1643 | palignr $15, -16(%rdi, %rdx), D(%xmm0) /* same realignment as in the first step */ |
1644 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1645 | pcmpistri $0x1a, (%rsi,%rdx), %xmm0 |
1646 | #else |
1647 | movdqa (%rsi,%rdx), %xmm1 |
1648 | TOLOWER (%xmm0, %xmm1) |
1649 | pcmpistri $0x1a, %xmm1, %xmm0 |
1650 | #endif |
1651 | jbe LABEL(exit_use) |
1652 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1653 | sub $16, %r11 |
1654 | jbe LABEL(strcmp_exitz) |
1655 | #endif |
1656 | add $16, %rdx |
1657 | jmp LABEL(loop_ashr_15_use) |
1658 | |
1659 | .p2align 4 |
1660 | LABEL(nibble_ashr_15_use): /* the next 16-byte chunk of %rdi starts a new 4K page */ |
1661 | sub $0x1000, %r10 /* restart the page counter for the following page */ |
1662 | movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, still on the current page */ |
1663 | psrldq $15, D(%xmm0) /* keep its top byte in the lowest lane, zero above */ |
1664 | pcmpistri $0x3a,%xmm0, %xmm0 /* compared with itself: %ecx = index of the first NUL byte */ |
1665 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1666 | cmp %r11, %rcx |
1667 | jae LABEL(nibble_ashr_exit_use) /* %rcx >= %r11 (unsigned): finish on this byte */ |
1668 | #endif |
1669 | cmp $0, %ecx |
1670 | ja LABEL(nibble_ashr_15_restart_use) /* %ecx > 0: the leftover byte is not NUL, go on; otherwise fall through to nibble_ashr_exit_use */ |
1671 | |
1672 | LABEL(nibble_ashr_exit_use): /* shared nibble exit: %xmm0 holds the leftover %rdi bytes, zero above them */ |
1673 | #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
1674 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* compare the leftover bytes with the chunk at (%rsi,%rdx) */ |
1675 | #else |
1676 | movdqa (%rsi,%rdx), %xmm1 |
1677 | TOLOWER (%xmm0, %xmm1) |
1678 | pcmpistri $0x1a, %xmm1, %xmm0 |
1679 | #endif |
1680 | .p2align 4 |
1681 | LABEL(exit_use): /* common exit of the loop_ashr_N_use loops; flags and %ecx come from pcmpistri $0x1a */ |
1682 | jnc LABEL(strcmp_exitz) /* CF clear: no differing byte was reported, return 0 */ |
1683 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1684 | sub %rcx, %r11 |
1685 | jbe LABEL(strcmp_exitz) /* %r11 <= %rcx: the difference is at or past the count in %r11, return 0 */ |
1686 | #endif |
1687 | add %rcx, %rdx /* %rdx = chunk index + offset of the differing byte */ |
1688 | lea -16(%rdi, %r9), %rdi /* rebase %rdi by %r9 - 16 so that the same %rdx addresses its byte */ |
1689 | movzbl (%rdi, %rdx), %eax |
1690 | movzbl (%rsi, %rdx), %edx |
1691 | test %r8d, %r8d |
1692 | jz LABEL(ret_use) |
1693 | xchg %eax, %edx /* %r8d != 0: swap the two bytes before subtracting */ |
1694 | LABEL(ret_use): |
1695 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1696 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* tolower table address plus 128 entries of 4 bytes */ |
1697 | movl (%rcx,%rdx,4), %edx /* replace each byte by its 32-bit table entry */ |
1698 | movl (%rcx,%rax,4), %eax |
1699 | #endif |
1700 | |
1701 | sub %edx, %eax /* return value: difference of the two bytes */ |
1702 | ret |
1703 | |
1704 | LABEL(less32bytes): /* target of the jnz in each ashr_N entry, taken when the first-chunk masks differ */ |
1705 | lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ |
1706 | lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ |
1707 | test %r8d, %r8d |
1708 | jz LABEL(ret) |
1709 | xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ |
1710 | |
1711 | .p2align 4 |
1712 | LABEL(ret): |
1713 | LABEL(less16bytes): /* two labels for one return path; %rdx holds a bit mask, non-zero on the less32bytes route */ |
1714 | bsf %rdx, %rdx /* find and store bit index in %rdx */ |
1715 | |
1716 | #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
1717 | sub %rdx, %r11 |
1718 | jbe LABEL(strcmp_exitz) /* %r11 <= index: the byte is at or past the count in %r11, return 0 */ |
1719 | #endif |
1720 | movzbl (%rsi, %rdx), %ecx |
1721 | movzbl (%rdi, %rdx), %eax |
1722 | |
1723 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1724 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* tolower table address plus 128 entries of 4 bytes */ |
1725 | movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */ |
1726 | movl (%rdx,%rax,4), %eax |
1727 | #endif |
1728 | |
1729 | sub %ecx, %eax /* return value: byte at %rdi minus byte at %rsi */ |
1730 | ret |
1731 | |
1732 | LABEL(strcmp_exitz): /* shared exit for the "no difference" outcome: return 0 in %eax */ |
1733 | xor %eax, %eax |
1734 | ret |
1735 | |
1736 | .p2align 4 |
1737 | /* Byte0: return the difference of the first bytes at %rdi and %rsi; same tail as the less16bytes return path, without an index. */ |
1738 | LABEL(Byte0): |
1739 | movzbl (%rsi), %ecx /* explicit byte -> 32-bit zero extension, as in the other return paths */ |
1740 | movzbl (%rdi), %eax |
1741 | |
1742 | #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
1743 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* tolower table address plus 128 entries of 4 bytes */ |
1744 | movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */ |
1745 | movl (%rdx,%rax,4), %eax |
1746 | #endif |
1747 | |
1748 | sub %ecx, %eax /* return value: byte at %rdi minus byte at %rsi */ |
1749 | ret |
1750 | cfi_endproc |
1751 | .size STRCMP_SSE42, .-STRCMP_SSE42 |
1752 | |
1753 | #undef UCLOW_reg |
1754 | #undef UCHIGH_reg |
1755 | #undef LCQWORD_reg |
1756 | #undef TOLOWER |
1757 | |
1758 | /* Dispatch table for the ashr_N entry points, kept in the .rodata section that matches the SSE 4.2 code section. Each entry is the 32-bit offset of one entry point from the table itself. */ |
1759 | .section .rodata.SECTION,"a" ,@progbits |
1760 | .p2align 3 |
1761 | LABEL(unaligned_table): |
1762 | .int LABEL(ashr_1) - LABEL(unaligned_table) /* slot 0; slot k holds ashr_(k+1) */ |
1763 | .int LABEL(ashr_2) - LABEL(unaligned_table) |
1764 | .int LABEL(ashr_3) - LABEL(unaligned_table) |
1765 | .int LABEL(ashr_4) - LABEL(unaligned_table) |
1766 | .int LABEL(ashr_5) - LABEL(unaligned_table) |
1767 | .int LABEL(ashr_6) - LABEL(unaligned_table) |
1768 | .int LABEL(ashr_7) - LABEL(unaligned_table) |
1769 | .int LABEL(ashr_8) - LABEL(unaligned_table) |
1770 | .int LABEL(ashr_9) - LABEL(unaligned_table) |
1771 | .int LABEL(ashr_10) - LABEL(unaligned_table) |
1772 | .int LABEL(ashr_11) - LABEL(unaligned_table) |
1773 | .int LABEL(ashr_12) - LABEL(unaligned_table) |
1774 | .int LABEL(ashr_13) - LABEL(unaligned_table) |
1775 | .int LABEL(ashr_14) - LABEL(unaligned_table) |
1776 | .int LABEL(ashr_15) - LABEL(unaligned_table) |
1777 | .int LABEL(ashr_0) - LABEL(unaligned_table) /* slot 15: ashr_0 comes last */ |
1778 | |
1779 | #undef LABEL |
1780 | #undef GLABEL |
1781 | #undef SECTION |
1782 | #undef movdqa |
1783 | #undef movdqu |
1784 | #undef pmovmskb |
1785 | #undef pcmpistri |
1786 | #undef psubb |
1787 | #undef pcmpeqb |
1788 | #undef psrldq |
1789 | #undef pslldq |
1790 | #undef palignr |
1791 | #undef pxor |
1792 | #undef D |
1793 | |