1 | /* strcmp optimized with SSE4.2. |
2 | Copyright (C) 2017-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (2) |
22 | |
23 | # include <sysdep.h> |
24 | |
25 | # define STRCMP_ISA _sse42 |
26 | # include "strcmp-naming.h" |
27 | |
28 | # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
29 | # include "locale-defines.h" |
30 | # endif |
31 | |
32 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
33 | /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz |
34 | if the new counter > the old one or is 0. |
    | |
    | At every expansion site in view %rcx holds the byte offset (0-15) that |
    | was used to shift the masks of the first, partial 16-byte block, so |
    | 16 - %rcx bytes have just been compared and the new counter is |
    | %r11 + %rcx - 16. Clobbers %r9 and the flags; %r11 is only updated |
    | when execution falls through. */ |
35 | # define UPDATE_STRNCMP_COUNTER \ |
36 | /* calculate left number to compare: %r9 = %r11 - (16 - %rcx) */ \ |
37 | lea -16(%rcx, %r11), %r9; \ |
38 | cmp %r9, %r11; /* flags from %r11 - %r9 */ \ |
39 | jb LABEL(strcmp_exitz); /* old < new: the subtraction wrapped */ \ |
40 | test %r9, %r9; \ |
41 | je LABEL(strcmp_exitz); /* nothing left to compare */ \ |
42 | mov %r9, %r11 |
43 | # else |
44 | # define UPDATE_STRNCMP_COUNTER |
45 | # endif |
46 | |
47 | # define SECTION sse4.2 /* section-name suffix used by the .section directive below */ |
48 | |
49 | # define LABEL(l) .L##l /* LABEL(x) pastes to .Lx, i.e. a ".L"-prefixed label */ |
50 | |
51 | /* We use 0x1a: |
52 | _SIDD_SBYTE_OPS |
53 | | _SIDD_CMP_EQUAL_EACH |
54 | | _SIDD_NEGATIVE_POLARITY |
55 | | _SIDD_LEAST_SIGNIFICANT |
56 | on pcmpistri to find out if two 16byte data elements are the same |
57 | and the offset of the first different byte. There are 4 cases: |
58 | |
59 | 1. Both 16byte data elements are valid and identical. |
60 | 2. Both 16byte data elements have EOS and are identical. |
61 | 3. Both 16byte data elements are valid and they differ at offset X. |
62 | 4. At least one 16byte data element has EOS at offset X. Two 16byte |
63 | data elements must differ at or before offset X. |
64 | |
65 | Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: |
66 | |
67 | case ECX CFlag ZFlag SFlag |
68 | 1 16 0 0 0 |
69 | 2 16 0 1 1 |
70 | 3 X 1 0 0 |
71 | 4 0 <= X 1 0/1 0/1 |
72 | |
73 | We exit from the loop for cases 2, 3 and 4 with jbe which branches |
74 | when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for |
75 | case 2. */ |
76 | |
77 | /* Put all SSE 4.2 functions together. |
    | The two STRCASECMP stubs below are the entry points that take no |
    | locale argument: each loads the calling thread's locale pointer into |
    | the register where the _l variant expects it, pads to a 16-byte |
    | boundary and falls through into STRCMP. */ |
78 | .section .text.SECTION,"ax" ,@progbits |
79 | .align 16 |
80 | .type STRCMP, @function |
81 | .globl STRCMP |
82 | # ifdef USE_AS_STRCASECMP_L |
83 | ENTRY (STRCASECMP) |
84 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE */ |
85 | mov %fs:(%rax),%RDX_LP /* thread's locale pointer -> third-argument register */ |
86 | |
87 | /* Either 1 or 5 bytes (depending if CET is enabled). */ |
88 | .p2align 4 |
89 | END (STRCASECMP) |
90 | /* FALLTHROUGH to strcasecmp_l. */ |
91 | # endif |
92 | # ifdef USE_AS_STRNCASECMP_L |
93 | ENTRY (STRCASECMP) |
94 | movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer offset of __libc_tsd_LOCALE */ |
95 | mov %fs:(%rax),%RCX_LP /* thread's locale pointer -> fourth-argument register */ |
96 | |
97 | /* Either 1 or 5 bytes (depending if CET is enabled). */ |
98 | .p2align 4 |
99 | END (STRCASECMP) |
100 | /* FALLTHROUGH to strncasecmp_l. */ |
101 | # endif |
102 | |
103 | |
104 | # define arg arg |
105 | |
106 | STRCMP: |
107 | cfi_startproc |
108 | _CET_ENDBR |
109 | CALL_MCOUNT |
110 | |
111 | /* |
112 | * This implementation uses SSE to compare up to 16 bytes at a time. |
    | * |
    | * Entry sequence: for the case-insensitive variants, jump to the |
    | * *_nonascii routine when the locale requires it; for the length-limited |
    | * variants, take the n == 0 and n == 1 shortcuts; then compare the first |
    | * 16 bytes with unaligned loads, provided neither load can cross a |
    | * 64-byte cache line. If those 16 bytes are equal and contain no null |
    | * char, both pointers advance by 16 and execution falls through into |
    | * crosscache. |
    | * |
    | * Register roles established here: %rdi/%rsi = the two strings, |
    | * %eax/%ecx = their offsets within a cache line, %r11 = bytes left to |
    | * compare (length-limited variants only), %xmm4-%xmm6 = case-folding |
    | * constants (case-insensitive variants only). |
113 | */ |
114 | # ifdef USE_AS_STRCASECMP_L |
115 | /* We have to fall back on the C implementation for locales |
116 | with encodings not matching ASCII for single bytes. */ |
117 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
118 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP |
119 | # else |
120 | mov (%rdx), %RAX_LP |
121 | # endif |
122 | testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* bit 0 of the NONASCII_CASE value */ |
123 | jne __strcasecmp_l_nonascii |
124 | # endif |
125 | # ifdef USE_AS_STRNCASECMP_L |
126 | /* We have to fall back on the C implementation for locales |
127 | with encodings not matching ASCII for single bytes. */ |
128 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
129 | mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP |
130 | # else |
131 | mov (%rcx), %RAX_LP |
132 | # endif |
133 | testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* bit 0 of the NONASCII_CASE value */ |
134 | jne __strncasecmp_l_nonascii |
135 | # endif |
136 | |
137 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
138 | test %RDX_LP, %RDX_LP |
139 | je LABEL(strcmp_exitz) /* n == 0: nothing to compare */ |
140 | cmp $1, %RDX_LP |
141 | je LABEL(Byte0) /* n == 1: handled at Byte0 (defined outside this view) */ |
142 | mov %RDX_LP, %R11_LP /* %r11 = remaining byte count */ |
143 | # endif |
144 | mov %esi, %ecx |
145 | mov %edi, %eax |
146 | /* Use 64bit AND here to avoid long NOP padding. */ |
147 | and $0x3f, %rcx /* rsi alignment in cache line */ |
148 | and $0x3f, %rax /* rdi alignment in cache line */ |
149 | # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
150 | .section .rodata.cst16,"aM" ,@progbits,16 |
151 | .align 16 |
152 | LABEL(lcase_min): /* bias: adding 0x3f maps 'A'..'Z' (0x41..0x5a) to 0x80..0x99 */ |
153 | .quad 0x3f3f3f3f3f3f3f3f |
154 | .quad 0x3f3f3f3f3f3f3f3f |
155 | LABEL(lcase_max): /* upper end of the biased 'A'..'Z' range */ |
156 | .quad 0x9999999999999999 |
157 | .quad 0x9999999999999999 |
158 | LABEL(case_add): /* 0x20 = 'a' - 'A' */ |
159 | .quad 0x2020202020202020 |
160 | .quad 0x2020202020202020 |
161 | .previous |
162 | movdqa LABEL(lcase_min)(%rip), %xmm4 |
163 | # define LCASE_MIN_reg %xmm4 |
164 | movdqa LABEL(lcase_max)(%rip), %xmm5 |
165 | # define LCASE_MAX_reg %xmm5 |
166 | movdqa LABEL(case_add)(%rip), %xmm6 |
167 | # define CASE_ADD_reg %xmm6 |
168 | # endif |
169 | cmp $0x30, %ecx |
170 | ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ |
171 | cmp $0x30, %eax |
172 | ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ |
173 | movdqu (%rdi), %xmm1 |
174 | movdqu (%rsi), %xmm2 |
175 | # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
176 | # define TOLOWER(reg1, reg2) /* add 0x20 to every 'A'..'Z' byte of reg1 and reg2; clobbers %xmm7, %xmm8 */ \ |
177 | movdqa LCASE_MIN_reg, %xmm7; \ |
178 | movdqa LCASE_MIN_reg, %xmm8; \ |
179 | paddb reg1, %xmm7; /* biased copy: upper-case letters land in 0x80..0x99 */ \ |
180 | paddb reg2, %xmm8; \ |
181 | pcmpgtb LCASE_MAX_reg, %xmm7; /* 0xff where biased byte > 0x99 as signed, i.e. not 'A'..'Z' */ \ |
182 | pcmpgtb LCASE_MAX_reg, %xmm8; \ |
183 | pandn CASE_ADD_reg, %xmm7; /* 0x20 for 'A'..'Z' bytes, 0 elsewhere */ \ |
184 | pandn CASE_ADD_reg, %xmm8; \ |
185 | paddb %xmm7, reg1; /* fold to lower case */ \ |
186 | paddb %xmm8, reg2 |
187 | |
188 | TOLOWER (%xmm1, %xmm2) |
189 | # else |
190 | # define TOLOWER(reg1, reg2) |
191 | # endif |
192 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
193 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
194 | pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ |
195 | psubb %xmm0, %xmm1 /* equal-mask minus null-mask: 0xff only where bytes are equal and not null */ |
196 | pmovmskb %xmm1, %edx |
197 | sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ |
198 | jnz LABEL(less16bytes)/* If not, find different value or null char */ |
199 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
200 | sub $16, %r11 |
201 | jbe LABEL(strcmp_exitz)/* finish comparison */ |
202 | # endif |
203 | add $16, %rsi /* prepare to search next 16 bytes */ |
204 | add $16, %rdi /* prepare to search next 16 bytes */ |
205 | |
206 | /* |
207 | * Determine source and destination string offsets from 16-byte |
208 | * alignment. Use relative offset difference between the two to |
209 | * determine which case below to use. |
    | * |
    | * Both pointers are rounded down to a 16-byte boundary so the loads |
    | * below can use movdqa. Equal offsets go to ashr_0. Otherwise the |
    | * strings are swapped if necessary so that %rsi/%ecx carry the larger |
    | * offset; %r8d is 0xffff when a swap happened and 0 otherwise. The |
    | * target is then taken from unaligned_table (defined outside this |
    | * view) at index 15 + %rax - %rcx. |
    | * |
    | * Handed to the target: %xmm1 = first block of %rsi, %xmm2 = first |
    | * block of %rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %ecx/%eax = the two offsets. |
210 | */ |
211 | .p2align 4 |
212 | LABEL(crosscache): |
213 | and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ |
214 | and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ |
215 | mov $0xffff, %edx /* for equivalent offset */ |
216 | xor %r8d, %r8d |
217 | and $0xf, %ecx /* offset of rsi */ |
218 | and $0xf, %eax /* offset of rdi */ |
219 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ |
220 | cmp %eax, %ecx |
221 | je LABEL(ashr_0) /* rsi and rdi relative offset same */ |
222 | ja LABEL(bigger) |
223 | mov %edx, %r8d /* r8d is offset flag for exit tail */ |
224 | xchg %ecx, %eax |
225 | xchg %rsi, %rdi |
226 | LABEL(bigger): |
227 | movdqa (%rdi), %xmm2 |
228 | movdqa (%rsi), %xmm1 |
229 | lea 15(%rax), %r9 |
230 | sub %rcx, %r9 /* %r9 = 15 + %rax - %rcx, the table index */ |
231 | lea LABEL(unaligned_table)(%rip), %r10 |
232 | movslq (%r10, %r9,4), %r9 /* 32-bit table entry, sign-extended */ |
233 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
234 | lea (%r10, %r9), %r10 /* entry is added to the table base to form the target */ |
235 | _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ |
236 | |
237 | /* |
238 | * The following cases will be handled by ashr_0 |
239 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
240 | * n(0~15) n(0~15) 15(15+ n-n) ashr_0 |
    | * |
    | * Both strings have the same offset within their 16-byte blocks, so |
    | * after the first (partial) block every pair of aligned blocks can be |
    | * compared directly. Expects %edx = 0xffff, %xmm0 = 0 and %cl = the |
    | * common offset, as set up at crosscache. |
241 | */ |
242 | .p2align 4 |
243 | LABEL(ashr_0): |
244 | |
245 | movdqa (%rsi), %xmm1 |
246 | pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
247 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
248 | pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ |
249 | # else |
250 | movdqa (%rdi), %xmm2 |
251 | TOLOWER (%xmm1, %xmm2) |
252 | pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ |
253 | # endif |
254 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
255 | pmovmskb %xmm1, %r9d |
256 | shr %cl, %edx /* adjust 0xffff for offset */ |
257 | shr %cl, %r9d /* adjust for 16-byte offset */ |
258 | sub %r9d, %edx |
259 | /* |
260 | * %edx equals %r9d (difference 0) only if the 16 - %rcx bytes that |
261 | * belong to the strings are equal in both blocks and no null char was seen. |
262 | */ |
263 | jne LABEL(less32bytes) /* mismatch or null char */ |
264 | UPDATE_STRNCMP_COUNTER |
265 | mov $16, %rcx |
266 | mov $16, %r9 |
267 | |
268 | /* |
269 | * Now both strings are aligned at 16-byte boundary. Loop over strings |
270 | * checking 32-bytes per iteration. |
271 | */ |
272 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
273 | .p2align 4 |
274 | LABEL(ashr_0_use): |
275 | movdqa (%rdi,%rdx), %xmm0 |
276 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
277 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
278 | # else |
279 | movdqa (%rsi,%rdx), %xmm1 |
280 | TOLOWER (%xmm0, %xmm1) |
281 | pcmpistri $0x1a, %xmm1, %xmm0 |
282 | # endif |
283 | lea 16(%rdx), %rdx /* advance the index; lea leaves the pcmpistri flags intact */ |
284 | jbe LABEL(ashr_0_exit_use) /* CF or ZF set: mismatch or end of string */ |
285 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
286 | sub $16, %r11 |
287 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
288 | # endif |
289 | |
290 | movdqa (%rdi,%rdx), %xmm0 /* second 16-byte compare of the iteration */ |
291 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
292 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
293 | # else |
294 | movdqa (%rsi,%rdx), %xmm1 |
295 | TOLOWER (%xmm0, %xmm1) |
296 | pcmpistri $0x1a, %xmm1, %xmm0 |
297 | # endif |
298 | lea 16(%rdx), %rdx |
299 | jbe LABEL(ashr_0_exit_use) |
300 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
301 | sub $16, %r11 |
302 | jbe LABEL(strcmp_exitz) |
303 | # endif |
304 | jmp LABEL(ashr_0_use) |
305 | |
306 | |
307 | .p2align 4 |
308 | LABEL(ashr_0_exit_use): |
309 | jnc LABEL(strcmp_exitz) /* CF clear: blocks identical up to EOS (case 2) */ |
310 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
311 | sub %rcx, %r11 |
312 | jbe LABEL(strcmp_exitz) /* differing byte lies at or beyond the length limit */ |
313 | # endif |
314 | lea -16(%rdx, %rcx), %rcx /* %rdx was already advanced by 16; %rcx = offset of the differing byte */ |
315 | movzbl (%rdi, %rcx), %eax |
316 | movzbl (%rsi, %rcx), %edx |
317 | # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
318 | leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 4-byte entries indexed by byte value */ |
319 | movl (%rcx,%rax,4), %eax |
320 | movl (%rcx,%rdx,4), %edx |
321 | # endif |
322 | sub %edx, %eax /* return the difference of the two bytes */ |
323 | ret |
324 | |
325 | |
326 | |
327 | /* |
328 | * The following cases will be handled by ashr_1 |
329 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
330 | * n(15) n -15 0(15 +(n-15) - n) ashr_1 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 15, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 1 byte |
    | * into the preceding aligned %rdi block (hence palignr $1 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
331 | */ |
332 | .p2align 4 |
333 | LABEL(ashr_1): |
334 | pslldq $15, %xmm2 /* shift first string to align with second */ |
335 | TOLOWER (%xmm1, %xmm2) |
336 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
337 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
338 | pmovmskb %xmm2, %r9d |
339 | shr %cl, %edx /* adjust 0xffff for offset */ |
340 | shr %cl, %r9d /* adjust for 16-byte offset */ |
341 | sub %r9d, %edx |
342 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
343 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
344 | UPDATE_STRNCMP_COUNTER |
345 | |
346 | mov $16, %rcx /* index for loads*/ |
347 | mov $1, %r9d /* byte position left over from less32bytes case */ |
348 | /* |
349 | * Setup %r10 value allows us to detect crossing a page boundary. |
350 | * When %r10 goes positive we have crossed a page boundary and |
351 | * need to do a nibble. |
352 | */ |
353 | lea 1(%rdi), %r10 |
354 | and $0xfff, %r10 /* offset into 4K page */ |
355 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
356 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
357 | |
358 | .p2align 4 |
359 | LABEL(loop_ashr_1_use): |
360 | add $16, %r10 |
361 | jg LABEL(nibble_ashr_1_use) /* block at (%rdi,%rdx) starts a new page */ |
362 | |
363 | LABEL(nibble_ashr_1_restart_use): |
364 | movdqa (%rdi, %rdx), %xmm0 |
365 | palignr $1, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 1 byte into the previous block */ |
366 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
367 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
368 | # else |
369 | movdqa (%rsi,%rdx), %xmm1 |
370 | TOLOWER (%xmm0, %xmm1) |
371 | pcmpistri $0x1a, %xmm1, %xmm0 |
372 | # endif |
373 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
374 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
375 | sub $16, %r11 |
376 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
377 | # endif |
378 | |
379 | add $16, %rdx |
380 | add $16, %r10 |
381 | jg LABEL(nibble_ashr_1_use) |
382 | |
383 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
384 | palignr $1, -16(%rdi, %rdx), %xmm0 |
385 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
386 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
387 | # else |
388 | movdqa (%rsi,%rdx), %xmm1 |
389 | TOLOWER (%xmm0, %xmm1) |
390 | pcmpistri $0x1a, %xmm1, %xmm0 |
391 | # endif |
392 | jbe LABEL(exit_use) |
393 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
394 | sub $16, %r11 |
395 | jbe LABEL(strcmp_exitz) |
396 | # endif |
397 | add $16, %rdx |
398 | jmp LABEL(loop_ashr_1_use) |
399 | |
400 | .p2align 4 |
401 | LABEL(nibble_ashr_1_use): |
402 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
403 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
404 | psrldq $1, %xmm0 /* drop the 1 byte that precedes the current position */ |
405 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
406 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
407 | cmp %r11, %rcx |
408 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
409 | # endif |
410 | cmp $14, %ecx |
411 | ja LABEL(nibble_ashr_1_restart_use) /* all 15 remaining bytes are non-null: continue into the new page */ |
412 | |
413 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
414 | |
415 | /* |
416 | * The following cases will be handled by ashr_2 |
417 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
418 | * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 14, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 2 bytes |
    | * into the preceding aligned %rdi block (hence palignr $2 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
419 | */ |
420 | .p2align 4 |
421 | LABEL(ashr_2): |
422 | pslldq $14, %xmm2 /* move the %rdi bytes up 14 positions to line up with %xmm1 */ |
423 | TOLOWER (%xmm1, %xmm2) |
424 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
425 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
426 | pmovmskb %xmm2, %r9d |
427 | shr %cl, %edx /* adjust 0xffff for offset */ |
428 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
429 | sub %r9d, %edx |
430 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
431 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
432 | UPDATE_STRNCMP_COUNTER |
433 | |
434 | mov $16, %rcx /* index for loads */ |
435 | mov $2, %r9d /* byte position left over from less32bytes case */ |
436 | /* |
437 | * Setup %r10 value allows us to detect crossing a page boundary. |
438 | * When %r10 goes positive we have crossed a page boundary and |
439 | * need to do a nibble. |
440 | */ |
441 | lea 2(%rdi), %r10 |
442 | and $0xfff, %r10 /* offset into 4K page */ |
443 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
444 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
445 | |
446 | .p2align 4 |
447 | LABEL(loop_ashr_2_use): |
448 | add $16, %r10 |
449 | jg LABEL(nibble_ashr_2_use) /* block at (%rdi,%rdx) starts a new page */ |
450 | |
451 | LABEL(nibble_ashr_2_restart_use): |
452 | movdqa (%rdi, %rdx), %xmm0 |
453 | palignr $2, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 2 bytes into the previous block */ |
454 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
455 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
456 | # else |
457 | movdqa (%rsi,%rdx), %xmm1 |
458 | TOLOWER (%xmm0, %xmm1) |
459 | pcmpistri $0x1a, %xmm1, %xmm0 |
460 | # endif |
461 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
462 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
463 | sub $16, %r11 |
464 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
465 | # endif |
466 | |
467 | add $16, %rdx |
468 | add $16, %r10 |
469 | jg LABEL(nibble_ashr_2_use) |
470 | |
471 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
472 | palignr $2, -16(%rdi, %rdx), %xmm0 |
473 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
474 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
475 | # else |
476 | movdqa (%rsi,%rdx), %xmm1 |
477 | TOLOWER (%xmm0, %xmm1) |
478 | pcmpistri $0x1a, %xmm1, %xmm0 |
479 | # endif |
480 | jbe LABEL(exit_use) |
481 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
482 | sub $16, %r11 |
483 | jbe LABEL(strcmp_exitz) |
484 | # endif |
485 | add $16, %rdx |
486 | jmp LABEL(loop_ashr_2_use) |
487 | |
488 | .p2align 4 |
489 | LABEL(nibble_ashr_2_use): |
490 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
491 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
492 | psrldq $2, %xmm0 /* drop the 2 bytes that precede the current position */ |
493 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
494 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
495 | cmp %r11, %rcx |
496 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
497 | # endif |
498 | cmp $13, %ecx |
499 | ja LABEL(nibble_ashr_2_restart_use) /* all 14 remaining bytes are non-null: continue into the new page */ |
500 | |
501 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
502 | |
503 | /* |
504 | * The following cases will be handled by ashr_3 |
505 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
506 | * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 13, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 3 bytes |
    | * into the preceding aligned %rdi block (hence palignr $3 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
    | * |
    | * The loop head is aligned with .p2align 4 like the loop heads of the |
    | * other ashr_N cases. |
507 | */ |
508 | .p2align 4 |
509 | LABEL(ashr_3): |
510 | pslldq $13, %xmm2 /* move the %rdi bytes up 13 positions to line up with %xmm1 */ |
511 | TOLOWER (%xmm1, %xmm2) |
512 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
513 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
514 | pmovmskb %xmm2, %r9d |
515 | shr %cl, %edx /* adjust 0xffff for offset */ |
516 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
517 | sub %r9d, %edx |
518 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
519 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
520 | |
521 | UPDATE_STRNCMP_COUNTER |
522 | |
523 | mov $16, %rcx /* index for loads */ |
524 | mov $3, %r9d /* byte position left over from less32bytes case */ |
525 | /* |
526 | * Setup %r10 value allows us to detect crossing a page boundary. |
527 | * When %r10 goes positive we have crossed a page boundary and |
528 | * need to do a nibble. |
529 | */ |
530 | lea 3(%rdi), %r10 |
531 | and $0xfff, %r10 /* offset into 4K page */ |
532 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
533 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
534 | .p2align 4 |
535 | LABEL(loop_ashr_3_use): |
536 | add $16, %r10 |
537 | jg LABEL(nibble_ashr_3_use) /* block at (%rdi,%rdx) starts a new page */ |
538 | |
539 | LABEL(nibble_ashr_3_restart_use): |
540 | movdqa (%rdi, %rdx), %xmm0 |
541 | palignr $3, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 3 bytes into the previous block */ |
542 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
543 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
544 | # else |
545 | movdqa (%rsi,%rdx), %xmm1 |
546 | TOLOWER (%xmm0, %xmm1) |
547 | pcmpistri $0x1a, %xmm1, %xmm0 |
548 | # endif |
549 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
550 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
551 | sub $16, %r11 |
552 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
553 | # endif |
554 | |
555 | add $16, %rdx |
556 | add $16, %r10 |
557 | jg LABEL(nibble_ashr_3_use) |
558 | |
559 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
560 | palignr $3, -16(%rdi, %rdx), %xmm0 |
561 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
562 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
563 | # else |
564 | movdqa (%rsi,%rdx), %xmm1 |
565 | TOLOWER (%xmm0, %xmm1) |
566 | pcmpistri $0x1a, %xmm1, %xmm0 |
567 | # endif |
568 | jbe LABEL(exit_use) |
569 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
570 | sub $16, %r11 |
571 | jbe LABEL(strcmp_exitz) |
572 | # endif |
573 | add $16, %rdx |
574 | jmp LABEL(loop_ashr_3_use) |
575 | |
576 | .p2align 4 |
577 | LABEL(nibble_ashr_3_use): |
578 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
579 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
580 | psrldq $3, %xmm0 /* drop the 3 bytes that precede the current position */ |
581 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
582 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
583 | cmp %r11, %rcx |
584 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
585 | # endif |
586 | cmp $12, %ecx |
587 | ja LABEL(nibble_ashr_3_restart_use) /* all 13 remaining bytes are non-null: continue into the new page */ |
588 | |
589 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
590 | |
591 | /* |
592 | * The following cases will be handled by ashr_4 |
593 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
594 | * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 12, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 4 bytes |
    | * into the preceding aligned %rdi block (hence palignr $4 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
595 | */ |
596 | .p2align 4 |
597 | LABEL(ashr_4): |
598 | pslldq $12, %xmm2 /* move the %rdi bytes up 12 positions to line up with %xmm1 */ |
599 | TOLOWER (%xmm1, %xmm2) |
600 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
601 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
602 | pmovmskb %xmm2, %r9d |
603 | shr %cl, %edx /* adjust 0xffff for offset */ |
604 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
605 | sub %r9d, %edx |
606 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
607 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
608 | |
609 | UPDATE_STRNCMP_COUNTER |
610 | |
611 | mov $16, %rcx /* index for loads */ |
612 | mov $4, %r9d /* byte position left over from less32bytes case */ |
613 | /* |
614 | * Setup %r10 value allows us to detect crossing a page boundary. |
615 | * When %r10 goes positive we have crossed a page boundary and |
616 | * need to do a nibble. |
617 | */ |
618 | lea 4(%rdi), %r10 |
619 | and $0xfff, %r10 /* offset into 4K page */ |
620 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
621 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
622 | |
623 | .p2align 4 |
624 | LABEL(loop_ashr_4_use): |
625 | add $16, %r10 |
626 | jg LABEL(nibble_ashr_4_use) /* block at (%rdi,%rdx) starts a new page */ |
627 | |
628 | LABEL(nibble_ashr_4_restart_use): |
629 | movdqa (%rdi, %rdx), %xmm0 |
630 | palignr $4, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 4 bytes into the previous block */ |
631 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
632 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
633 | # else |
634 | movdqa (%rsi,%rdx), %xmm1 |
635 | TOLOWER (%xmm0, %xmm1) |
636 | pcmpistri $0x1a, %xmm1, %xmm0 |
637 | # endif |
638 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
639 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
640 | sub $16, %r11 |
641 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
642 | # endif |
643 | |
644 | add $16, %rdx |
645 | add $16, %r10 |
646 | jg LABEL(nibble_ashr_4_use) |
647 | |
648 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
649 | palignr $4, -16(%rdi, %rdx), %xmm0 |
650 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
651 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
652 | # else |
653 | movdqa (%rsi,%rdx), %xmm1 |
654 | TOLOWER (%xmm0, %xmm1) |
655 | pcmpistri $0x1a, %xmm1, %xmm0 |
656 | # endif |
657 | jbe LABEL(exit_use) |
658 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
659 | sub $16, %r11 |
660 | jbe LABEL(strcmp_exitz) |
661 | # endif |
662 | add $16, %rdx |
663 | jmp LABEL(loop_ashr_4_use) |
664 | |
665 | .p2align 4 |
666 | LABEL(nibble_ashr_4_use): |
667 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
668 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
669 | psrldq $4, %xmm0 /* drop the 4 bytes that precede the current position */ |
670 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
671 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
672 | cmp %r11, %rcx |
673 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
674 | # endif |
675 | cmp $11, %ecx |
676 | ja LABEL(nibble_ashr_4_restart_use) /* all 12 remaining bytes are non-null: continue into the new page */ |
677 | |
678 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
679 | |
680 | /* |
681 | * The following cases will be handled by ashr_5 |
682 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
683 | * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 11, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 5 bytes |
    | * into the preceding aligned %rdi block (hence palignr $5 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
684 | */ |
685 | .p2align 4 |
686 | LABEL(ashr_5): |
687 | pslldq $11, %xmm2 /* move the %rdi bytes up 11 positions to line up with %xmm1 */ |
688 | TOLOWER (%xmm1, %xmm2) |
689 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
690 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
691 | pmovmskb %xmm2, %r9d |
692 | shr %cl, %edx /* adjust 0xffff for offset */ |
693 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
694 | sub %r9d, %edx |
695 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
696 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
697 | |
698 | UPDATE_STRNCMP_COUNTER |
699 | |
700 | mov $16, %rcx /* index for loads */ |
701 | mov $5, %r9d /* byte position left over from less32bytes case */ |
702 | /* |
703 | * Setup %r10 value allows us to detect crossing a page boundary. |
704 | * When %r10 goes positive we have crossed a page boundary and |
705 | * need to do a nibble. |
706 | */ |
707 | lea 5(%rdi), %r10 |
708 | and $0xfff, %r10 /* offset into 4K page */ |
709 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
710 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
711 | |
712 | .p2align 4 |
713 | LABEL(loop_ashr_5_use): |
714 | add $16, %r10 |
715 | jg LABEL(nibble_ashr_5_use) /* block at (%rdi,%rdx) starts a new page */ |
716 | |
717 | LABEL(nibble_ashr_5_restart_use): |
718 | movdqa (%rdi, %rdx), %xmm0 |
719 | palignr $5, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 5 bytes into the previous block */ |
720 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
721 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
722 | # else |
723 | movdqa (%rsi,%rdx), %xmm1 |
724 | TOLOWER (%xmm0, %xmm1) |
725 | pcmpistri $0x1a, %xmm1, %xmm0 |
726 | # endif |
727 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
728 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
729 | sub $16, %r11 |
730 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
731 | # endif |
732 | |
733 | add $16, %rdx |
734 | add $16, %r10 |
735 | jg LABEL(nibble_ashr_5_use) |
736 | |
737 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
738 | |
739 | palignr $5, -16(%rdi, %rdx), %xmm0 |
740 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
741 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
742 | # else |
743 | movdqa (%rsi,%rdx), %xmm1 |
744 | TOLOWER (%xmm0, %xmm1) |
745 | pcmpistri $0x1a, %xmm1, %xmm0 |
746 | # endif |
747 | jbe LABEL(exit_use) |
748 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
749 | sub $16, %r11 |
750 | jbe LABEL(strcmp_exitz) |
751 | # endif |
752 | add $16, %rdx |
753 | jmp LABEL(loop_ashr_5_use) |
754 | |
755 | .p2align 4 |
756 | LABEL(nibble_ashr_5_use): |
757 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
758 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
759 | psrldq $5, %xmm0 /* drop the 5 bytes that precede the current position */ |
760 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
761 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
762 | cmp %r11, %rcx |
763 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
764 | # endif |
765 | cmp $10, %ecx |
766 | ja LABEL(nibble_ashr_5_restart_use) /* all 11 remaining bytes are non-null: continue into the new page */ |
767 | |
768 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
769 | |
770 | /* |
771 | * The following cases will be handled by ashr_6 |
772 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
773 | * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 10, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 6 bytes |
    | * into the preceding aligned %rdi block (hence palignr $6 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
774 | */ |
775 | .p2align 4 |
776 | LABEL(ashr_6): |
777 | pslldq $10, %xmm2 /* move the %rdi bytes up 10 positions to line up with %xmm1 */ |
778 | TOLOWER (%xmm1, %xmm2) |
779 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
780 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
781 | pmovmskb %xmm2, %r9d |
782 | shr %cl, %edx /* adjust 0xffff for offset */ |
783 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
784 | sub %r9d, %edx |
785 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
786 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
787 | |
788 | UPDATE_STRNCMP_COUNTER |
789 | |
790 | mov $16, %rcx /* index for loads */ |
791 | mov $6, %r9d /* byte position left over from less32bytes case */ |
792 | /* |
793 | * Setup %r10 value allows us to detect crossing a page boundary. |
794 | * When %r10 goes positive we have crossed a page boundary and |
795 | * need to do a nibble. |
796 | */ |
797 | lea 6(%rdi), %r10 |
798 | and $0xfff, %r10 /* offset into 4K page */ |
799 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
800 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
801 | |
802 | .p2align 4 |
803 | LABEL(loop_ashr_6_use): |
804 | add $16, %r10 |
805 | jg LABEL(nibble_ashr_6_use) /* block at (%rdi,%rdx) starts a new page */ |
806 | |
807 | LABEL(nibble_ashr_6_restart_use): |
808 | movdqa (%rdi, %rdx), %xmm0 |
809 | palignr $6, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 6 bytes into the previous block */ |
810 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
811 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
812 | # else |
813 | movdqa (%rsi,%rdx), %xmm1 |
814 | TOLOWER (%xmm0, %xmm1) |
815 | pcmpistri $0x1a, %xmm1, %xmm0 |
816 | # endif |
817 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
818 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
819 | sub $16, %r11 |
820 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
821 | # endif |
822 | |
823 | add $16, %rdx |
824 | add $16, %r10 |
825 | jg LABEL(nibble_ashr_6_use) |
826 | |
827 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
828 | palignr $6, -16(%rdi, %rdx), %xmm0 |
829 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
830 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
831 | # else |
832 | movdqa (%rsi,%rdx), %xmm1 |
833 | TOLOWER (%xmm0, %xmm1) |
834 | pcmpistri $0x1a, %xmm1, %xmm0 |
835 | # endif |
836 | jbe LABEL(exit_use) |
837 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
838 | sub $16, %r11 |
839 | jbe LABEL(strcmp_exitz) |
840 | # endif |
841 | add $16, %rdx |
842 | jmp LABEL(loop_ashr_6_use) |
843 | |
844 | .p2align 4 |
845 | LABEL(nibble_ashr_6_use): |
846 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
847 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
848 | psrldq $6, %xmm0 /* drop the 6 bytes that precede the current position */ |
849 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
850 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
851 | cmp %r11, %rcx |
852 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
853 | # endif |
854 | cmp $9, %ecx |
855 | ja LABEL(nibble_ashr_6_restart_use) /* all 10 remaining bytes are non-null: continue into the new page */ |
856 | |
857 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
858 | |
859 | /* |
860 | * The following cases will be handled by ashr_7 |
861 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
862 | * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 |
    | * |
    | * The %rsi offset exceeds the %rdi offset by 9, so each aligned block |
    | * of the %rsi string pairs with the 16 %rdi bytes that start 7 bytes |
    | * into the preceding aligned %rdi block (hence palignr $7 below). |
    | * Expects, as set up at crosscache/bigger: %xmm1/%xmm2 = first blocks |
    | * of %rsi/%rdi, %xmm0 = null-char mask of %xmm1, %edx = 0xffff, |
    | * %cl = offset of %rsi. |
863 | */ |
864 | .p2align 4 |
865 | LABEL(ashr_7): |
866 | pslldq $9, %xmm2 /* move the %rdi bytes up 9 positions to line up with %xmm1 */ |
867 | TOLOWER (%xmm1, %xmm2) |
868 | pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
869 | psubb %xmm0, %xmm2 /* remove positions holding a null char */ |
870 | pmovmskb %xmm2, %r9d |
871 | shr %cl, %edx /* adjust 0xffff for offset */ |
872 | shr %cl, %r9d /* discard the bits of bytes before the string start */ |
873 | sub %r9d, %edx |
874 | jnz LABEL(less32bytes) /* mismatch or null char seen */ |
875 | movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the part of the file visible here */ |
876 | |
877 | UPDATE_STRNCMP_COUNTER |
878 | |
879 | mov $16, %rcx /* index for loads */ |
880 | mov $7, %r9d /* byte position left over from less32bytes case */ |
881 | /* |
882 | * Setup %r10 value allows us to detect crossing a page boundary. |
883 | * When %r10 goes positive we have crossed a page boundary and |
884 | * need to do a nibble. |
885 | */ |
886 | lea 7(%rdi), %r10 |
887 | and $0xfff, %r10 /* offset into 4K page */ |
888 | sub $0x1000, %r10 /* subtract 4K pagesize */ |
889 | mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ |
890 | |
891 | .p2align 4 |
892 | LABEL(loop_ashr_7_use): |
893 | add $16, %r10 |
894 | jg LABEL(nibble_ashr_7_use) /* block at (%rdi,%rdx) starts a new page */ |
895 | |
896 | LABEL(nibble_ashr_7_restart_use): |
897 | movdqa (%rdi, %rdx), %xmm0 |
898 | palignr $7, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes starting 7 bytes into the previous block */ |
899 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
900 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
901 | # else |
902 | movdqa (%rsi,%rdx), %xmm1 |
903 | TOLOWER (%xmm0, %xmm1) |
904 | pcmpistri $0x1a, %xmm1, %xmm0 |
905 | # endif |
906 | jbe LABEL(exit_use) /* CF or ZF set: mismatch or end of string */ |
907 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
908 | sub $16, %r11 |
909 | jbe LABEL(strcmp_exitz) /* length limit reached */ |
910 | # endif |
911 | |
912 | add $16, %rdx |
913 | add $16, %r10 |
914 | jg LABEL(nibble_ashr_7_use) |
915 | |
916 | movdqa (%rdi, %rdx), %xmm0 /* second 16-byte compare of the iteration */ |
917 | palignr $7, -16(%rdi, %rdx), %xmm0 |
918 | # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L |
919 | pcmpistri $0x1a,(%rsi,%rdx), %xmm0 |
920 | # else |
921 | movdqa (%rsi,%rdx), %xmm1 |
922 | TOLOWER (%xmm0, %xmm1) |
923 | pcmpistri $0x1a, %xmm1, %xmm0 |
924 | # endif |
925 | jbe LABEL(exit_use) |
926 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
927 | sub $16, %r11 |
928 | jbe LABEL(strcmp_exitz) |
929 | # endif |
930 | add $16, %rdx |
931 | jmp LABEL(loop_ashr_7_use) |
932 | |
933 | .p2align 4 |
934 | LABEL(nibble_ashr_7_use): |
935 | sub $0x1000, %r10 /* rearm the page counter for the new page */ |
936 | movdqa -16(%rdi, %rdx), %xmm0 /* last block of the old page */ |
937 | psrldq $7, %xmm0 /* drop the 7 bytes that precede the current position */ |
938 | pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first null byte */ |
939 | # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L |
940 | cmp %r11, %rcx |
941 | jae LABEL(nibble_ashr_exit_use) /* remaining count %r11 does not reach past those bytes */ |
942 | # endif |
943 | cmp $8, %ecx |
944 | ja LABEL(nibble_ashr_7_restart_use) /* all 9 remaining bytes are non-null: continue into the new page */ |
945 | |
946 | jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */ |
947 | |
948 | /* |
949 | * The following cases will be handled by ashr_8 |
950 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
951 | * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 |
952 | */ |
953 | .p2align 4 |
/* ashr_8: compare path for the case where the %rdi stream must be
   shifted by 8 bytes (palignr $8) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   8; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_8):
	pslldq	$8, %xmm2		/* shift %xmm2 up by 16 - 8 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$8, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_8_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_8_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 8.  */
	palignr	$8, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_8_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$8, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_8_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 8 bytes still pending from the previous block.  */
LABEL(nibble_ashr_8_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$8, %xmm0		/* keep the 8 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 8 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$7, %ecx
	ja	LABEL(nibble_ashr_8_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1036 | |
1037 | /* |
1038 | * The following cases will be handled by ashr_9 |
1039 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1040 | * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 |
1041 | */ |
1042 | .p2align 4 |
/* ashr_9: compare path for the case where the %rdi stream must be
   shifted by 9 bytes (palignr $9) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   9; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_9):
	pslldq	$7, %xmm2		/* shift %xmm2 up by 16 - 9 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$9, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_9_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_9_restart_use):
	movdqa	(%rdi, %rdx), %xmm0

	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 9.  */
	palignr	$9, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$9, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_9_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 7 bytes still pending from the previous block.  */
LABEL(nibble_ashr_9_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$9, %xmm0		/* keep the 7 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 7 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$6, %ecx
	ja	LABEL(nibble_ashr_9_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1126 | |
1127 | /* |
1128 | * The following cases will be handled by ashr_10 |
1129 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1130 | * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 |
1131 | */ |
1132 | .p2align 4 |
/* ashr_10: compare path for the case where the %rdi stream must be
   shifted by 10 bytes (palignr $10) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   10; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_10):
	pslldq	$6, %xmm2		/* shift %xmm2 up by 16 - 10 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$10, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_10_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_10_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 10.  */
	palignr	$10, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$10, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_10_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 6 bytes still pending from the previous block.  */
LABEL(nibble_ashr_10_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$10, %xmm0		/* keep the 6 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 6 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$5, %ecx
	ja	LABEL(nibble_ashr_10_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1215 | |
1216 | /* |
1217 | * The following cases will be handled by ashr_11 |
1218 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1219 | * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 |
1220 | */ |
1221 | .p2align 4 |
/* ashr_11: compare path for the case where the %rdi stream must be
   shifted by 11 bytes (palignr $11) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   11; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_11):
	pslldq	$5, %xmm2		/* shift %xmm2 up by 16 - 11 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_11_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_11_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 11.  */
	palignr	$11, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$11, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_11_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 5 bytes still pending from the previous block.  */
LABEL(nibble_ashr_11_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$11, %xmm0		/* keep the 5 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 5 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$4, %ecx
	ja	LABEL(nibble_ashr_11_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1304 | |
1305 | /* |
1306 | * The following cases will be handled by ashr_12 |
1307 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1308 | * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 |
1309 | */ |
1310 | .p2align 4 |
/* ashr_12: compare path for the case where the %rdi stream must be
   shifted by 12 bytes (palignr $12) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   12; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_12):
	pslldq	$4, %xmm2		/* shift %xmm2 up by 16 - 12 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$12, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_12_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_12_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 12.  */
	palignr	$12, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_12_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 4 bytes still pending from the previous block.  */
LABEL(nibble_ashr_12_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$12, %xmm0		/* keep the 4 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 4 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$3, %ecx
	ja	LABEL(nibble_ashr_12_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1393 | |
1394 | /* |
1395 | * The following cases will be handled by ashr_13 |
1396 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1397 | * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 |
1398 | */ |
1399 | .p2align 4 |
/* ashr_13: compare path for the case where the %rdi stream must be
   shifted by 13 bytes (palignr $13) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   13; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_13):
	pslldq	$3, %xmm2		/* shift %xmm2 up by 16 - 13 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$13, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_13_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_13_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 13.  */
	palignr	$13, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_13_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 3 bytes still pending from the previous block.  */
LABEL(nibble_ashr_13_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$13, %xmm0		/* keep the 3 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 3 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$2, %ecx
	ja	LABEL(nibble_ashr_13_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1483 | |
1484 | /* |
1485 | * The following cases will be handled by ashr_14 |
1486 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1487 | * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 |
1488 | */ |
1489 | .p2align 4 |
/* ashr_14: compare path for the case where the %rdi stream must be
   shifted by 14 bytes (palignr $14) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   14; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_14):
	pslldq	$2, %xmm2		/* shift %xmm2 up by 16 - 14 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */
	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$14, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_14_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_14_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 14.  */
	palignr	$14, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_14_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, look for a
   NUL in the 2 bytes still pending from the previous block.  */
LABEL(nibble_ashr_14_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$14, %xmm0		/* keep the 2 pending bytes, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (at most 2 because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending bytes */
# endif
	cmp	$1, %ecx
	ja	LABEL(nibble_ashr_14_restart_use)	/* no NUL among them: resume the loop */

	jmp	LABEL(nibble_ashr_exit_use)	/* string ends here: final compare */
1573 | |
1574 | /* |
1575 | * The following cases will be handled by ashr_15 |
1576 | * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
1577 | * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 |
1578 | */ |
1579 | .p2align 4 |
/* ashr_15: compare path for the case where the %rdi stream must be
   shifted by 15 bytes (palignr $15) to line up with the %rsi blocks.

   Entry state is produced by dispatch code outside this section, so it
   is assumed here (TODO confirm): %xmm1/%xmm2 hold the first aligned
   16-byte block of each string, %xmm0 flags NUL bytes, %edx is a
   16-bit all-ones mask and %cl the in-block start offset.

   Registers used by the loop:
     %rdx  byte offset of the current 16-byte block
     %r9   15; exit_use uses it to locate the matching %rdi byte
     %r10  page-crossing counter (goes positive at a 4K boundary)
     %r11  remaining length, strncmp/strncasecmp builds only  */
LABEL(ashr_15):
	pslldq	$1, %xmm2		/* shift %xmm2 up by 16 - 15 bytes */
	TOLOWER (%xmm1, %xmm2)		/* macro defined earlier in the file */
	pcmpeqb	%xmm1, %xmm2		/* 0xff in each byte where the blocks agree */
	psubb	%xmm0, %xmm2		/* clears the match bit where %xmm0 is 0xff */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only if the two masks are identical */
	jnz	LABEL(less32bytes)	/* result is decided inside the first block */

	/* NOTE(review): %xmm3 is not referenced again by any instruction
	   visible in this handler; confirm whether the load is needed.  */
	movdqa	(%rdi), %xmm3

	/* strncmp builds: %r11 = %rcx + %r11 - 16, returning 0 if that is
	   zero or wraps; expands to nothing otherwise.  */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx		/* index for loads */
	mov	$15, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */

	sub	$0x1000, %r10		/* subtract 4K pagesize */

	mov	%rcx, %rdx		/* %rdx = block offset used by the loop, starts at 16 */

	.p2align 4
LABEL(loop_ashr_15_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)	/* counter went positive: take the page-boundary path */

LABEL(nibble_ashr_15_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 15.  */
	palignr	$15, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)		/* CF: bytes differ; ZF: NUL in the %rsi block */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11		/* 16 more bytes consumed */
	jbe	LABEL(strcmp_exitz)	/* length limit reached: return 0 */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

	/* Second, identical step of the unrolled loop.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_15_use)

	.p2align 4
/* Page-boundary path: before the next %rdi block is loaded, check
   whether the single byte still pending from the previous block is a
   NUL.  */
LABEL(nibble_ashr_15_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$15, %xmm0		/* keep the 1 pending byte, zero-filled above */
	/* Self-compare with control 0x3a: %ecx = index of the first NUL
	   byte in %xmm0 (0 or 1 here because of the zero fill).  */
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* length limit ends within the pending byte */
# endif
	cmp	$0, %ecx
	ja	LABEL(nibble_ashr_15_restart_use)	/* pending byte is not NUL: resume the loop */
	/* Otherwise execution falls through into nibble_ashr_exit_use.  */
1663 | |
/* Shared exit of the *_use compare loops.

   nibble_ashr_exit_use: reached from the page-boundary paths with the
   pending, zero-filled %rdi bytes in %xmm0.  It compares them with the
   block at (%rsi,%rdx) and falls into exit_use.

   exit_use: expects the flags and %ecx left by a pcmpistri $0x1a.
   Returns 0 when no byte differs (CF clear), otherwise the difference
   of the two bytes at the first mismatching index.  %r9 holds the
   shift amount of the calling handler and %r8d is nonzero when the two
   string pointers were exchanged earlier.  */
LABEL(nibble_ashr_exit_use):
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)		/* macro defined earlier in the file */
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	.p2align 4
LABEL(exit_use):
	jnc	LABEL(strcmp_exitz)	/* CF clear: no differing byte, return 0 */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rcx, %r11
	jbe	LABEL(strcmp_exitz)	/* mismatch lies at or beyond the length limit */
# endif
	add	%rcx, %rdx		/* %rdx = offset of the differing byte */
	lea	-16(%rdi, %r9), %rdi	/* account for the palignr shift on the %rdi side */
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	test	%r8d, %r8d
	jz	LABEL(ret_use)
	xchg	%eax, %edx		/* pointers were exchanged: restore operand order */
LABEL(ret_use):
# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the table at _nl_C_LC_CTYPE_tolower + 128*4
	   (4-byte entries) before subtracting.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
	movl	(%rcx,%rdx,4), %edx
	movl	(%rcx,%rax,4), %eax
# endif

	sub	%edx, %eax		/* return value in %eax */
	ret
1695 | |
/* less32bytes: exit used when the first-block check of an ashr_N
   handler finds a difference.  %rax and %rcx are added to %rdi and
   %rsi to get the exact string addresses, and the pointers are
   exchanged back when %r8d is nonzero.

   ret / less16bytes: %rdx holds a nonzero bit mask; the index of its
   lowest set bit selects the byte pair whose difference is returned.
   In the length-limited builds 0 is returned when that index is not
   below the remaining count in %r11.  */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)	/* index is at or beyond the length limit */
# endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the table at _nl_C_LC_CTYPE_tolower + 128*4
	   (4-byte entries) before subtracting.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax		/* return value in %eax */
	ret
1723 | |
/* Common "equal" exit: return 0 in %eax.  Reached when no difference
   was found before the terminating NUL or before the length limit.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret
1727 | |
1728 | .p2align 4 |
1729 | // XXX Same as code above |
/* Byte0: return the difference between the byte at (%rdi) and the byte
   at (%rsi).  This is the same sequence as the tail of LABEL(ret) with
   a byte index of zero.  Its callers are outside this section.  */
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the table at _nl_C_LC_CTYPE_tolower + 128*4
	   (4-byte entries) before subtracting.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax		/* return value in %eax */
	ret
1742 | cfi_endproc |
1743 | .size STRCMP, .-STRCMP |
1744 | |
1745 | # undef UCLOW_reg |
1746 | # undef UCHIGH_reg |
1747 | # undef LCQWORD_reg |
1748 | # undef TOLOWER |
1749 | |
1750 | /* Put all SSE 4.2 functions together. */ |
1751 | .section .rodata.SECTION,"a" ,@progbits |
1752 | .p2align 3 |
/* Jump table for the unaligned cases.  Each entry is a 32-bit offset
   from the start of the table to one handler: entries 0..14 select
   ashr_1..ashr_15 and entry 15 selects ashr_0.  The code that indexes
   the table is outside this section.  */
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
1770 | |
1771 | # undef LABEL |
1772 | # undef SECTION |
1773 | # undef movdqa |
1774 | # undef movdqu |
1775 | # undef pmovmskb |
1776 | # undef pcmpistri |
1777 | # undef psubb |
1778 | # undef pcmpeqb |
1779 | # undef psrldq |
1780 | # undef pslldq |
1781 | # undef palignr |
1782 | # undef pxor |
1783 | # undef D |
1784 | #endif |
1785 | |