| 1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. | 
|---|
| 2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. | 
|---|
| 3 | This file is part of the GNU C Library. | 
|---|
| 4 |  | 
|---|
| 5 | The GNU C Library is free software; you can redistribute it and/or | 
|---|
| 6 | modify it under the terms of the GNU Lesser General Public | 
|---|
| 7 | License as published by the Free Software Foundation; either | 
|---|
| 8 | version 2.1 of the License, or (at your option) any later version. | 
|---|
| 9 |  | 
|---|
| 10 | The GNU C Library is distributed in the hope that it will be useful, | 
|---|
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|---|
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
|---|
| 13 | Lesser General Public License for more details. | 
|---|
| 14 |  | 
|---|
| 15 | You should have received a copy of the GNU Lesser General Public | 
|---|
| 16 | License along with the GNU C Library; if not, see | 
|---|
| 17 | <https://www.gnu.org/licenses/>.  */ | 
|---|
| 18 |  | 
|---|
| 19 | #include <isa-level.h> | 
|---|
| 20 |  | 
|---|
| 21 | #if ISA_SHOULD_BUILD (4) | 
|---|
| 22 |  | 
|---|
| 23 | # define STRCMP_ISA	_evex | 
|---|
| 24 | # include "strcmp-naming.h" | 
|---|
| 25 |  | 
|---|
| 26 | # include <sysdep.h> | 
|---|
| 27 | # if defined USE_AS_STRCASECMP_L | 
|---|
| 28 | #  include "locale-defines.h" | 
|---|
| 29 | # endif | 
|---|
| 30 |  | 
|---|
| 31 | # ifndef STRCMP | 
|---|
| 32 | #  define STRCMP	__strcmp_evex | 
|---|
| 33 | # endif | 
|---|
| 34 |  | 
|---|
| 35 | # define PAGE_SIZE	4096 | 
|---|
| 36 |  | 
|---|
| 37 | /* VEC_SIZE = Number of bytes in a ymm register.  */ | 
|---|
| 38 | # define VEC_SIZE	32 | 
|---|
| 39 | # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR) | 
|---|
| 40 |  | 
|---|
| 41 | # define VMOVU	vmovdqu64 | 
|---|
| 42 | # define VMOVA	vmovdqa64 | 
|---|
| 43 |  | 
|---|
| 44 | # ifdef USE_AS_WCSCMP | 
|---|
| 45 | #  define TESTEQ	subl $0xff, | 
|---|
| 46 | /* Compare packed dwords.  */ | 
|---|
| 47 | #  define VPCMP	vpcmpd | 
|---|
| 48 | #  define VPMINU	vpminud | 
|---|
| 49 | #  define VPTESTM	vptestmd | 
|---|
| 50 | #  define VPTESTNM	vptestnmd | 
|---|
| 51 | /* 1 dword char == 4 bytes.  */ | 
|---|
| 52 | #  define SIZE_OF_CHAR	4 | 
|---|
| 53 | # else | 
|---|
| 54 | #  define TESTEQ	incl | 
|---|
| 55 | /* Compare packed bytes.  */ | 
|---|
| 56 | #  define VPCMP	vpcmpb | 
|---|
| 57 | #  define VPMINU	vpminub | 
|---|
| 58 | #  define VPTESTM	vptestmb | 
|---|
| 59 | #  define VPTESTNM	vptestnmb | 
|---|
| 60 | /* 1 byte char == 1 byte.  */ | 
|---|
| 61 | #  define SIZE_OF_CHAR	1 | 
|---|
| 62 | # endif | 
|---|
| 63 | /* Scratch register used for the loop mask and for offsets.  Under USE_AS_STRNCMP rdx carries the length bound (it is compared and decremented throughout STRCMP), so r9 is used there; the other variants are free to use rdx.  */ | 
|---|
| 64 | # ifdef USE_AS_STRNCMP | 
|---|
| 65 | #  define LOOP_REG	r9d | 
|---|
| 66 | #  define LOOP_REG64	r9 | 
|---|
| 67 |  | 
|---|
| 68 | #  define OFFSET_REG8	r9b | 
|---|
| 69 | #  define OFFSET_REG	r9d | 
|---|
| 70 | #  define OFFSET_REG64	r9 | 
|---|
| 71 | # else | 
|---|
| 72 | #  define LOOP_REG	edx | 
|---|
| 73 | #  define LOOP_REG64	rdx | 
|---|
| 74 |  | 
|---|
| 75 | #  define OFFSET_REG8	dl | 
|---|
| 76 | #  define OFFSET_REG	edx | 
|---|
| 77 | #  define OFFSET_REG64	rdx | 
|---|
| 78 | # endif | 
|---|
| 79 |  | 
|---|
| 80 | # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP | 
|---|
| 81 | #  define VEC_OFFSET	0 | 
|---|
| 82 | # else | 
|---|
| 83 | #  define VEC_OFFSET	(-VEC_SIZE) | 
|---|
| 84 | # endif | 
|---|
| 85 |  | 
|---|
| 86 | # define XMM0	xmm17 | 
|---|
| 87 | # define XMM1	xmm18 | 
|---|
| 88 |  | 
|---|
| 89 | # define XMM10	xmm27 | 
|---|
| 90 | # define XMM11	xmm28 | 
|---|
| 91 | # define XMM12	xmm29 | 
|---|
| 92 | # define XMM13	xmm30 | 
|---|
| 93 | # define XMM14	xmm31 | 
|---|
| 94 |  | 
|---|
| 95 |  | 
|---|
| 96 | # define YMM0	ymm17 | 
|---|
| 97 | # define YMM1	ymm18 | 
|---|
| 98 | # define YMM2	ymm19 | 
|---|
| 99 | # define YMM3	ymm20 | 
|---|
| 100 | # define YMM4	ymm21 | 
|---|
| 101 | # define YMM5	ymm22 | 
|---|
| 102 | # define YMM6	ymm23 | 
|---|
| 103 | # define YMM7	ymm24 | 
|---|
| 104 | # define YMM8	ymm25 | 
|---|
| 105 | # define YMM9	ymm26 | 
|---|
| 106 | # define YMM10	ymm27 | 
|---|
| 107 | # define YMM11	ymm28 | 
|---|
| 108 | # define YMM12	ymm29 | 
|---|
| 109 | # define YMM13	ymm30 | 
|---|
| 110 | # define YMM14	ymm31 | 
|---|
| 111 |  | 
|---|
| 112 | # ifdef USE_AS_STRCASECMP_L | 
|---|
| 113 | #  define BYTE_LOOP_REG	OFFSET_REG | 
|---|
| 114 | # else | 
|---|
| 115 | #  define BYTE_LOOP_REG	ecx | 
|---|
| 116 | # endif | 
|---|
| 117 | /* Register that holds the locale argument on entry: rcx (4th integer argument) when a length is also passed (USE_AS_STRNCMP), otherwise rdx (3rd integer argument).  */ | 
|---|
| 118 | # ifdef USE_AS_STRCASECMP_L | 
|---|
| 119 | #  ifdef USE_AS_STRNCMP | 
|---|
| 120 | #   define LOCALE_REG	rcx | 
|---|
| 121 | #   define LOCALE_REG_LP	RCX_LP | 
|---|
| 122 | #  else | 
|---|
| 123 | #   define LOCALE_REG	rdx | 
|---|
| 124 | #   define LOCALE_REG_LP	RDX_LP | 
|---|
| 125 | #  endif | 
|---|
| 126 | # endif | 
|---|
| 127 |  | 
|---|
| 128 | # define LCASE_MIN_YMM	%YMM12 | 
|---|
| 129 | # define LCASE_MAX_YMM	%YMM13 | 
|---|
| 130 | # define CASE_ADD_YMM	%YMM14 | 
|---|
| 131 |  | 
|---|
| 132 | # define LCASE_MIN_XMM	%XMM12 | 
|---|
| 133 | # define LCASE_MAX_XMM	%XMM13 | 
|---|
| 134 | # define CASE_ADD_XMM	%XMM14 | 
|---|
| 135 |  | 
|---|
| 136 | /* NB: wcsncmp uses r11 but strcasecmp is never used in | 
|---|
| 137 | conjunction with wcscmp.  */ | 
|---|
| 138 | # define TOLOWER_BASE	%r11 | 
|---|
| 139 | /* TOLOWER (reg1, reg2, ext): convert ASCII 'A'-'Z' to lower case in place in both vectors.  A byte is upper case iff (byte - 0x41) is below 0x1a as an unsigned value; those lanes (masks k5/k6) get 0x20 added.  Requires the LCASE_MIN/LCASE_MAX/CASE_ADD registers to hold the broadcast constants loaded at function entry.  Clobbers vector registers 10 and 11 of width EXT, plus k5 and k6.  REG pastes EXT and a register number into an alias such as YMM10.  */ | 
|---|
| 140 | # ifdef USE_AS_STRCASECMP_L | 
|---|
| 141 | #  define _REG(x, y) x ## y | 
|---|
| 142 | #  define REG(x, y) _REG(x, y) | 
|---|
| 143 | #  define TOLOWER(reg1, reg2, ext)										\ | 
|---|
| 144 | vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\ | 
|---|
| 145 | vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\ | 
|---|
| 146 | vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\ | 
|---|
| 147 | vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\ | 
|---|
| 148 | vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\ | 
|---|
| 149 | vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6} | 
|---|
| 150 | /* TOLOWER_gpr (src, dst): scalar lower-casing by loading the 4-byte table entry at TOLOWER_BASE + SRC * 4 into DST.  TOLOWER_YMM and TOLOWER_XMM forward to TOLOWER with the vector width appended.  */ | 
|---|
| 151 | #  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst | 
|---|
| 152 | #  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM) | 
|---|
| 153 | #  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM) | 
|---|
| 154 | /* Case-insensitive compares: both operands are lower-cased in place with TOLOWER and then compared for equality (VPCMP predicate 0) into REG_OUT.  CMP_R1_S2 first loads S2_MEM into S2_REG with an unaligned move.  Both S1_REG and S2_REG are modified.  */ | 
|---|
| 155 | #  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\ | 
|---|
| 156 | TOLOWER	(s1_reg, s2_reg, ext);										\ | 
|---|
| 157 | VPCMP	$0, s1_reg, s2_reg, reg_out | 
|---|
| 158 |  | 
|---|
| 159 | #  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\ | 
|---|
| 160 | VMOVU	s2_mem, s2_reg;												\ | 
|---|
| 161 | CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) | 
|---|
| 162 | /* Width-specific wrappers: append YMM or XMM as the EXT argument.  */ | 
|---|
| 163 | #  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM) | 
|---|
| 164 | #  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) | 
|---|
| 165 |  | 
|---|
| 166 | #  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) | 
|---|
| 167 | #  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) | 
|---|
| 168 |  | 
|---|
| 169 | # else | 
|---|
| 170 | #  define TOLOWER_gpr(...) | 
|---|
| 171 | #  define TOLOWER_YMM(...) | 
|---|
| 172 | #  define TOLOWER_XMM(...) | 
|---|
| 173 | /* Case-sensitive compares: a single VPCMP for equality (predicate 0) into REG_OUT.  The S2 operand is used directly (register or memory), so the third argument of CMP_R1_S2_YMM is ignored.  The XMM names simply forward to the YMM forms.  */ | 
|---|
| 174 | #  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\ | 
|---|
| 175 | VPCMP	$0, s2_reg, s1_reg, reg_out | 
|---|
| 176 |  | 
|---|
| 177 | #  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) | 
|---|
| 178 |  | 
|---|
| 179 | #  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\ | 
|---|
| 180 | VPCMP	$0, s2_mem, s1_reg, reg_out | 
|---|
| 181 |  | 
|---|
| 182 | #  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) | 
|---|
| 183 | # endif | 
|---|
| 184 |  | 
|---|
| 185 | /* Warning! | 
|---|
| 186 | wcscmp/wcsncmp have to use SIGNED comparison for elements. | 
|---|
| 187 | strcmp/strncmp have to use UNSIGNED comparison for elements. | 
|---|
| 188 | */ | 
|---|
| 189 |  | 
|---|
| 190 | /* The main idea of the string comparison (byte or dword) using 256-bit | 
|---|
| 191 | EVEX instructions consists of comparing (VPCMP) two ymm vectors. The | 
|---|
| 192 | latter can be on either packed bytes or dwords depending on | 
|---|
| 193 | USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the | 
|---|
| 194 | matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 | 
|---|
| 195 | KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) | 
|---|
| 196 | are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd | 
|---|
| 197 | instructions.  Main loop (away from page boundary) compares 4 | 
|---|
| 198 | vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128 | 
|---|
| 199 | bytes) on each loop. | 
|---|
| 200 |  | 
|---|
| 201 | The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic | 
|---|
| 202 | is the same as strcmp, except that a maximum offset is tracked.  If | 
|---|
| 203 | the maximum offset is reached before a difference is found, zero is | 
|---|
| 204 | returned.  */ | 
|---|
| 205 |  | 
|---|
| 206 | .section .text.evex, "ax", @progbits | 
|---|
| 207 | .align	16 | 
|---|
| 208 | .type	STRCMP, @function | 
|---|
| 209 | .globl	STRCMP | 
|---|
| 210 | # ifdef USE_AS_STRCASECMP_L | 
|---|
| 211 | ENTRY (STRCASECMP) | 
|---|
| 212 | movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax | 
|---|
| 213 | mov	%fs:(%rax), %LOCALE_REG_LP | 
|---|
| 214 | /* The two moves above read the thread-local __libc_tsd_LOCALE value (TLS offset fetched from the GOT, then an %fs-relative load) into the locale argument register.  */ | 
|---|
| 215 | /* Either 1 or 5 bytes (depending on whether CET is enabled).  */ | 
|---|
| 216 | .p2align 4 | 
|---|
| 217 | END (STRCASECMP) | 
|---|
| 218 | /* FALLTHROUGH to strcasecmp_l/strncasecmp_l (the STRCMP label below), now that the locale argument is set.  */ | 
|---|
| 219 | # endif | 
|---|
| 220 |  | 
|---|
| 221 | .p2align 4 | 
|---|
| 222 | STRCMP: | 
|---|
| 223 | cfi_startproc | 
|---|
| 224 | _CET_ENDBR | 
|---|
| 225 | CALL_MCOUNT | 
|---|
| 226 |  | 
|---|
| 227 | # if defined USE_AS_STRCASECMP_L | 
|---|
| 228 | /* We have to fall back on the C implementation for locales with | 
|---|
| 229 | encodings not matching ASCII for single bytes.  */ | 
|---|
| 230 | #  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | 
|---|
| 231 | mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP | 
|---|
| 232 | #  else | 
|---|
| 233 | mov	(%LOCALE_REG), %RAX_LP | 
|---|
| 234 | #  endif | 
|---|
| 235 | testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) | 
|---|
| 236 | jne	STRCASECMP_L_NONASCII | 
|---|
| 237 | leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE | 
|---|
| 238 | # endif | 
|---|
| 239 |  | 
|---|
| 240 | # ifdef USE_AS_STRNCMP | 
|---|
| 241 | /* Don't overwrite LOCALE_REG (rcx) until we have passed | 
|---|
| 242 | L(one_or_less). Otherwise we might use the wrong locale in | 
|---|
| 243 | the OVERFLOW_STRCMP (strcasecmp_l).  */ | 
|---|
| 244 | #  ifdef __ILP32__ | 
|---|
| 245 | /* Clear the upper 32 bits.  */ | 
|---|
| 246 | movl	%edx, %edx | 
|---|
| 247 | #  endif | 
|---|
| 248 | cmp	$1, %RDX_LP | 
|---|
| 249 | /* Signed comparison intentional. We use this branch to also | 
|---|
| 250 | test cases where length >= 2^63. These very large sizes can be | 
|---|
| 251 | handled with strcmp as there is no way for that length to | 
|---|
| 252 | actually bound the buffer.  */ | 
|---|
| 253 | jle	L(one_or_less) | 
|---|
| 254 | # endif | 
|---|
| 255 |  | 
|---|
| 256 | # if defined USE_AS_STRCASECMP_L | 
|---|
| 257 | .section .rodata.cst32, "aM", @progbits, 32 | 
|---|
| 258 | .align	32 | 
|---|
| 259 | L(lcase_min): | 
|---|
| 260 | .quad	0x4141414141414141 | 
|---|
| 261 | .quad	0x4141414141414141 | 
|---|
| 262 | .quad	0x4141414141414141 | 
|---|
| 263 | .quad	0x4141414141414141 | 
|---|
| 264 | L(lcase_max): | 
|---|
| 265 | .quad	0x1a1a1a1a1a1a1a1a | 
|---|
| 266 | .quad	0x1a1a1a1a1a1a1a1a | 
|---|
| 267 | .quad	0x1a1a1a1a1a1a1a1a | 
|---|
| 268 | .quad	0x1a1a1a1a1a1a1a1a | 
|---|
| 269 | L(case_add): | 
|---|
| 270 | .quad	0x2020202020202020 | 
|---|
| 271 | .quad	0x2020202020202020 | 
|---|
| 272 | .quad	0x2020202020202020 | 
|---|
| 273 | .quad	0x2020202020202020 | 
|---|
| 274 | .previous | 
|---|
| 275 |  | 
|---|
| 276 | vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM | 
|---|
| 277 | vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM | 
|---|
| 278 | vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM | 
|---|
| 279 | # endif | 
|---|
| 280 |  | 
|---|
| 281 | movl	%edi, %eax | 
|---|
| 282 | orl	%esi, %eax | 
|---|
| 283 | /* Shift out the bits irrelevant to page boundary ([63:12]).  */ | 
|---|
| 284 | sall	$20, %eax | 
|---|
| 285 | /* Check if s1 or s2 may cross a page in next 4x VEC loads.  */ | 
|---|
| 286 | cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax | 
|---|
| 287 | ja	L(page_cross) | 
|---|
| 288 |  | 
|---|
| 289 | L(no_page_cross): | 
|---|
| 290 | /* Safe to compare 4x vectors.  */ | 
|---|
| 291 | VMOVU	(%rdi), %YMM0 | 
|---|
| 292 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 293 | /* Each bit cleared in K1 represents a mismatch or a null CHAR | 
|---|
| 294 | in YMM0 and 32 bytes at (%rsi).  */ | 
|---|
| 295 | CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} | 
|---|
| 296 | kmovd	%k1, %ecx | 
|---|
| 297 | # ifdef USE_AS_STRNCMP | 
|---|
| 298 | cmpq	$CHAR_PER_VEC, %rdx | 
|---|
| 299 | jbe	L(vec_0_test_len) | 
|---|
| 300 | # endif | 
|---|
| 301 |  | 
|---|
| 302 | /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for | 
|---|
| 303 | wcscmp/wcsncmp.  */ | 
|---|
| 304 |  | 
|---|
| 305 | /* All 1s represents all equals. TESTEQ will overflow to zero in | 
|---|
| 306 | all equals case. Otherwise 1s will carry until position of first | 
|---|
| 307 | mismatch.  */ | 
|---|
| 308 | TESTEQ	%ecx | 
|---|
| 309 | jz	L(more_3x_vec) | 
|---|
| 310 |  | 
|---|
| 311 | .p2align 4,, 4 | 
|---|
| 312 | L(return_vec_0): | 
|---|
| 313 | tzcntl	%ecx, %ecx | 
|---|
| 314 | # ifdef USE_AS_WCSCMP | 
|---|
| 315 | movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 316 | xorl	%eax, %eax | 
|---|
| 317 | cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 318 | je	L(ret0) | 
|---|
| 319 | setl	%al | 
|---|
| 320 | negl	%eax | 
|---|
| 321 | orl	$1, %eax | 
|---|
| 322 | # else | 
|---|
| 323 | movzbl	(%rdi, %rcx), %eax | 
|---|
| 324 | movzbl	(%rsi, %rcx), %ecx | 
|---|
| 325 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 326 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 327 | subl	%ecx, %eax | 
|---|
| 328 | # endif | 
|---|
| 329 | L(ret0): | 
|---|
| 330 | ret | 
|---|
| 331 |  | 
|---|
| 332 | # ifdef USE_AS_STRNCMP | 
|---|
| 333 | .p2align 4,, 4 | 
|---|
| 334 | L(vec_0_test_len): | 
|---|
| 335 | notl	%ecx | 
|---|
| 336 | bzhil	%edx, %ecx, %eax | 
|---|
| 337 | jnz	L(return_vec_0) | 
|---|
| 338 | /* Align if will cross fetch block.  */ | 
|---|
| 339 | .p2align 4,, 2 | 
|---|
| 340 | L(ret_zero): | 
|---|
| 341 | xorl	%eax, %eax | 
|---|
| 342 | ret | 
|---|
| 343 |  | 
|---|
| 344 | .p2align 4,, 5 | 
|---|
| 345 | L(one_or_less): | 
|---|
| 346 | #  ifdef USE_AS_STRCASECMP_L | 
|---|
| 347 | /* Set locale argument for strcasecmp.  */ | 
|---|
| 348 | movq	%LOCALE_REG, %rdx | 
|---|
| 349 | #  endif | 
|---|
| 350 | jb	L(ret_zero) | 
|---|
| 351 | /* 'nbe' covers the case where length is negative (large | 
|---|
| 352 | unsigned).  */ | 
|---|
| 353 | jnbe	OVERFLOW_STRCMP | 
|---|
| 354 | #  ifdef USE_AS_WCSCMP | 
|---|
| 355 | movl	(%rdi), %edx | 
|---|
| 356 | xorl	%eax, %eax | 
|---|
| 357 | cmpl	(%rsi), %edx | 
|---|
| 358 | je	L(ret1) | 
|---|
| 359 | setl	%al | 
|---|
| 360 | negl	%eax | 
|---|
| 361 | orl	$1, %eax | 
|---|
| 362 | #  else | 
|---|
| 363 | movzbl	(%rdi), %eax | 
|---|
| 364 | movzbl	(%rsi), %ecx | 
|---|
| 365 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 366 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 367 | subl	%ecx, %eax | 
|---|
| 368 | #  endif | 
|---|
| 369 | L(ret1): | 
|---|
| 370 | ret | 
|---|
| 371 | # endif | 
|---|
| 372 |  | 
|---|
| 373 | .p2align 4,, 10 | 
|---|
| 374 | L(return_vec_1): | 
|---|
| 375 | tzcntl	%ecx, %ecx | 
|---|
| 376 | # ifdef USE_AS_STRNCMP | 
|---|
| 377 | /* rdx must be > CHAR_PER_VEC so it's safe to subtract without | 
|---|
| 378 | worrying about underflow.  */ | 
|---|
| 379 | addq	$-CHAR_PER_VEC, %rdx | 
|---|
| 380 | cmpq	%rcx, %rdx | 
|---|
| 381 | jbe	L(ret_zero) | 
|---|
| 382 | # endif | 
|---|
| 383 | # ifdef USE_AS_WCSCMP | 
|---|
| 384 | movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 385 | xorl	%eax, %eax | 
|---|
| 386 | cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 387 | je	L(ret2) | 
|---|
| 388 | setl	%al | 
|---|
| 389 | negl	%eax | 
|---|
| 390 | orl	$1, %eax | 
|---|
| 391 | # else | 
|---|
| 392 | movzbl	VEC_SIZE(%rdi, %rcx), %eax | 
|---|
| 393 | movzbl	VEC_SIZE(%rsi, %rcx), %ecx | 
|---|
| 394 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 395 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 396 | subl	%ecx, %eax | 
|---|
| 397 | # endif | 
|---|
| 398 | L(ret2): | 
|---|
| 399 | ret | 
|---|
| 400 |  | 
|---|
| 401 | .p2align 4,, 10 | 
|---|
| 402 | # ifdef USE_AS_STRNCMP | 
|---|
| 403 | L(return_vec_3): | 
|---|
| 404 | #  if CHAR_PER_VEC <= 16 | 
|---|
| 405 | sall	$CHAR_PER_VEC, %ecx | 
|---|
| 406 | #  else | 
|---|
| 407 | salq	$CHAR_PER_VEC, %rcx | 
|---|
| 408 | #  endif | 
|---|
| 409 | # endif | 
|---|
| 410 | L(return_vec_2): | 
|---|
| 411 | # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) | 
|---|
| 412 | tzcntl	%ecx, %ecx | 
|---|
| 413 | # else | 
|---|
| 414 | tzcntq	%rcx, %rcx | 
|---|
| 415 | # endif | 
|---|
| 416 |  | 
|---|
| 417 | # ifdef USE_AS_STRNCMP | 
|---|
| 418 | cmpq	%rcx, %rdx | 
|---|
| 419 | jbe	L(ret_zero) | 
|---|
| 420 | # endif | 
|---|
| 421 |  | 
|---|
| 422 | # ifdef USE_AS_WCSCMP | 
|---|
| 423 | movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 424 | xorl	%eax, %eax | 
|---|
| 425 | cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 426 | je	L(ret3) | 
|---|
| 427 | setl	%al | 
|---|
| 428 | negl	%eax | 
|---|
| 429 | orl	$1, %eax | 
|---|
| 430 | # else | 
|---|
| 431 | movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax | 
|---|
| 432 | movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx | 
|---|
| 433 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 434 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 435 | subl	%ecx, %eax | 
|---|
| 436 | # endif | 
|---|
| 437 | L(ret3): | 
|---|
| 438 | ret | 
|---|
| 439 |  | 
|---|
| 440 | # ifndef USE_AS_STRNCMP | 
|---|
| 441 | .p2align 4,, 10 | 
|---|
| 442 | L(return_vec_3): | 
|---|
| 443 | tzcntl	%ecx, %ecx | 
|---|
| 444 | #  ifdef USE_AS_WCSCMP | 
|---|
| 445 | movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 446 | xorl	%eax, %eax | 
|---|
| 447 | cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 448 | je	L(ret4) | 
|---|
| 449 | setl	%al | 
|---|
| 450 | negl	%eax | 
|---|
| 451 | orl	$1, %eax | 
|---|
| 452 | #  else | 
|---|
| 453 | movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax | 
|---|
| 454 | movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx | 
|---|
| 455 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 456 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 457 | subl	%ecx, %eax | 
|---|
| 458 | #  endif | 
|---|
| 459 | L(ret4): | 
|---|
| 460 | ret | 
|---|
| 461 | # endif | 
|---|
| 462 |  | 
|---|
| 463 | /* 32 byte align here ensures the main loop is ideally aligned | 
|---|
| 464 | for DSB.  */ | 
|---|
| 465 | .p2align 5 | 
|---|
| 466 | L(more_3x_vec): | 
|---|
| 467 | /* Safe to compare 4x vectors.  */ | 
|---|
| 468 | VMOVU	(VEC_SIZE)(%rdi), %YMM0 | 
|---|
| 469 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 470 | CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} | 
|---|
| 471 | kmovd	%k1, %ecx | 
|---|
| 472 | TESTEQ	%ecx | 
|---|
| 473 | jnz	L(return_vec_1) | 
|---|
| 474 |  | 
|---|
| 475 | # ifdef USE_AS_STRNCMP | 
|---|
| 476 | subq	$(CHAR_PER_VEC * 2), %rdx | 
|---|
| 477 | jbe	L(ret_zero) | 
|---|
| 478 | # endif | 
|---|
| 479 |  | 
|---|
| 480 | VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0 | 
|---|
| 481 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 482 | CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} | 
|---|
| 483 | kmovd	%k1, %ecx | 
|---|
| 484 | TESTEQ	%ecx | 
|---|
| 485 | jnz	L(return_vec_2) | 
|---|
| 486 |  | 
|---|
| 487 | VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0 | 
|---|
| 488 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 489 | CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} | 
|---|
| 490 | kmovd	%k1, %ecx | 
|---|
| 491 | TESTEQ	%ecx | 
|---|
| 492 | jnz	L(return_vec_3) | 
|---|
| 493 |  | 
|---|
| 494 | # ifdef USE_AS_STRNCMP | 
|---|
| 495 | cmpq	$(CHAR_PER_VEC * 2), %rdx | 
|---|
| 496 | jbe	L(ret_zero) | 
|---|
| 497 | # endif | 
|---|
| 498 |  | 
|---|
| 499 |  | 
|---|
| 500 | # ifdef USE_AS_WCSCMP | 
|---|
| 501 | /* any non-zero positive value that doesn't interfere with 0x1. | 
|---|
| 502 | */ | 
|---|
| 503 | movl	$2, %r8d | 
|---|
| 504 |  | 
|---|
| 505 | # else | 
|---|
| 506 | xorl	%r8d, %r8d | 
|---|
| 507 | # endif | 
|---|
| 508 |  | 
|---|
| 509 | /* The prepare labels are various entry points from the page | 
|---|
| 510 | cross logic.  */ | 
|---|
| 511 | L(prepare_loop): | 
|---|
| 512 |  | 
|---|
| 513 | # ifdef USE_AS_STRNCMP | 
|---|
| 514 | #  ifdef USE_AS_WCSCMP | 
|---|
| 515 | L(prepare_loop_no_len): | 
|---|
| 516 | movl	%edi, %ecx | 
|---|
| 517 | andl	$(VEC_SIZE * 4 - 1), %ecx | 
|---|
| 518 | shrl	$2, %ecx | 
|---|
| 519 | leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx | 
|---|
| 520 | #  else | 
|---|
| 521 | /* Store N + (VEC_SIZE * 4) and place check at the beginning of | 
|---|
| 522 | the loop.  */ | 
|---|
| 523 | leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx | 
|---|
| 524 | L(prepare_loop_no_len): | 
|---|
| 525 | #  endif | 
|---|
| 526 | # else | 
|---|
| 527 | L(prepare_loop_no_len): | 
|---|
| 528 | # endif | 
|---|
| 529 |  | 
|---|
| 530 | /* Align s1 and adjust s2 accordingly.  */ | 
|---|
| 531 | subq	%rdi, %rsi | 
|---|
| 532 | andq	$-(VEC_SIZE * 4), %rdi | 
|---|
| 533 | L(prepare_loop_readj): | 
|---|
| 534 | addq	%rdi, %rsi | 
|---|
| 535 | # if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) | 
|---|
| 536 | subq	%rdi, %rdx | 
|---|
| 537 | # endif | 
|---|
| 538 |  | 
|---|
| 539 | L(prepare_loop_aligned): | 
|---|
| 540 | /* eax stores distance from rsi to next page cross. These cases | 
|---|
| 541 | need to be handled specially as the 4x loop could potentially | 
|---|
| 542 | read memory past the length of s1 or s2 and across a page | 
|---|
| 543 | boundary.  */ | 
|---|
| 544 | movl	$-(VEC_SIZE * 4), %eax | 
|---|
| 545 | subl	%esi, %eax | 
|---|
| 546 | andl	$(PAGE_SIZE - 1), %eax | 
|---|
| 547 |  | 
|---|
| 548 |  | 
|---|
| 549 | /* Loop 4x comparisons at a time.  */ | 
|---|
| 550 | .p2align 4 | 
|---|
| 551 | L(loop): | 
|---|
| 552 |  | 
|---|
| 553 | /* End condition for strncmp.  */ | 
|---|
| 554 | # ifdef USE_AS_STRNCMP | 
|---|
| 555 | subq	$(CHAR_PER_VEC * 4), %rdx | 
|---|
| 556 | jbe	L(ret_zero) | 
|---|
| 557 | # endif | 
|---|
| 558 |  | 
|---|
| 559 | subq	$-(VEC_SIZE * 4), %rdi | 
|---|
| 560 | subq	$-(VEC_SIZE * 4), %rsi | 
|---|
| 561 |  | 
|---|
| 562 | /* Check if rsi loads will cross a page boundary.  */ | 
|---|
| 563 | addl	$-(VEC_SIZE * 4), %eax | 
|---|
| 564 | jnb	L(page_cross_during_loop) | 
|---|
| 565 |  | 
|---|
| 566 | /* Loop entry after handling page cross during loop.  */ | 
|---|
| 567 | L(loop_skip_page_cross_check): | 
|---|
| 568 | VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0 | 
|---|
| 569 | VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2 | 
|---|
| 570 | VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4 | 
|---|
| 571 | VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6 | 
|---|
| 572 |  | 
|---|
| 573 | VPMINU	%YMM0, %YMM2, %YMM8 | 
|---|
| 574 | VPMINU	%YMM4, %YMM6, %YMM9 | 
|---|
| 575 |  | 
|---|
| 576 | /* A zero CHAR in YMM9 means that there is a null CHAR.  */ | 
|---|
| 577 | VPMINU	%YMM8, %YMM9, %YMM9 | 
|---|
| 578 |  | 
|---|
| 579 | /* Each bit set in K1 represents a non-null CHAR in YMM9.  */ | 
|---|
| 580 | VPTESTM	%YMM9, %YMM9, %k1 | 
|---|
| 581 | # ifndef USE_AS_STRCASECMP_L | 
|---|
| 582 | vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 | 
|---|
| 583 | vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 | 
|---|
| 584 | vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 | 
|---|
| 585 | /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while | 
|---|
| 586 | oring with YMM1. Result is stored in YMM6.  */ | 
|---|
| 587 | vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 | 
|---|
| 588 | # else | 
|---|
| 589 | VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1 | 
|---|
| 590 | TOLOWER_YMM (%YMM0, %YMM1) | 
|---|
| 591 | VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3 | 
|---|
| 592 | TOLOWER_YMM (%YMM2, %YMM3) | 
|---|
| 593 | VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5 | 
|---|
| 594 | TOLOWER_YMM (%YMM4, %YMM5) | 
|---|
| 595 | VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7 | 
|---|
| 596 | TOLOWER_YMM (%YMM6, %YMM7) | 
|---|
| 597 | vpxorq	%YMM0, %YMM1, %YMM1 | 
|---|
| 598 | vpxorq	%YMM2, %YMM3, %YMM3 | 
|---|
| 599 | vpxorq	%YMM4, %YMM5, %YMM5 | 
|---|
| 600 | vpternlogd $0xde, %YMM7, %YMM1, %YMM6 | 
|---|
| 601 | # endif | 
|---|
| 602 | /* Or together YMM3, YMM5, and YMM6.  */ | 
|---|
| 603 | vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 | 
|---|
| 604 |  | 
|---|
| 605 |  | 
|---|
| 606 | /* A non-zero CHAR in YMM6 represents a mismatch.  */ | 
|---|
| 607 | VPTESTNM %YMM6, %YMM6, %k0{%k1} | 
|---|
| 608 | kmovd	%k0, %LOOP_REG | 
|---|
| 609 |  | 
|---|
| 610 | TESTEQ	%LOOP_REG | 
|---|
| 611 | jz	L(loop) | 
|---|
| 612 |  | 
|---|
| 613 |  | 
|---|
| 614 | /* Find which VEC has the mismatch or end of string.  */ | 
|---|
| 615 | VPTESTM	%YMM0, %YMM0, %k1 | 
|---|
| 616 | VPTESTNM %YMM1, %YMM1, %k0{%k1} | 
|---|
| 617 | kmovd	%k0, %ecx | 
|---|
| 618 | TESTEQ	%ecx | 
|---|
| 619 | jnz	L(return_vec_0_end) | 
|---|
| 620 |  | 
|---|
| 621 | VPTESTM	%YMM2, %YMM2, %k1 | 
|---|
| 622 | VPTESTNM %YMM3, %YMM3, %k0{%k1} | 
|---|
| 623 | kmovd	%k0, %ecx | 
|---|
| 624 | TESTEQ	%ecx | 
|---|
| 625 | jnz	L(return_vec_1_end) | 
|---|
| 626 |  | 
|---|
| 627 |  | 
|---|
| 628 | /* Handle VEC 2 and 3 without branches.  */ | 
|---|
| 629 | L(return_vec_2_3_end): | 
|---|
| 630 | # ifdef USE_AS_STRNCMP | 
|---|
| 631 | subq	$(CHAR_PER_VEC * 2), %rdx | 
|---|
| 632 | jbe	L(ret_zero_end) | 
|---|
| 633 | # endif | 
|---|
| 634 |  | 
|---|
| 635 | VPTESTM	%YMM4, %YMM4, %k1 | 
|---|
| 636 | VPTESTNM %YMM5, %YMM5, %k0{%k1} | 
|---|
| 637 | kmovd	%k0, %ecx | 
|---|
| 638 | TESTEQ	%ecx | 
|---|
| 639 | # if CHAR_PER_VEC <= 16 | 
|---|
| 640 | sall	$CHAR_PER_VEC, %LOOP_REG | 
|---|
| 641 | orl	%ecx, %LOOP_REG | 
|---|
| 642 | # else | 
|---|
| 643 | salq	$CHAR_PER_VEC, %LOOP_REG64 | 
|---|
| 644 | orq	%rcx, %LOOP_REG64 | 
|---|
| 645 | # endif | 
|---|
| 646 | L(return_vec_3_end): | 
|---|
| 647 | /* LOOP_REG contains matches for null/mismatch from the loop. If | 
|---|
| 648 | VEC 0,1,and 2 all have no null and no mismatches then mismatch | 
|---|
| 649 | must entirely be from VEC 3 which is fully represented by | 
|---|
| 650 | LOOP_REG.  */ | 
|---|
| 651 | # if CHAR_PER_VEC <= 16 | 
|---|
| 652 | tzcntl	%LOOP_REG, %LOOP_REG | 
|---|
| 653 | # else | 
|---|
| 654 | tzcntq	%LOOP_REG64, %LOOP_REG64 | 
|---|
| 655 | # endif | 
|---|
| 656 | # ifdef USE_AS_STRNCMP | 
|---|
| 657 | cmpq	%LOOP_REG64, %rdx | 
|---|
| 658 | jbe	L(ret_zero_end) | 
|---|
| 659 | # endif | 
|---|
| 660 |  | 
|---|
| 661 | # ifdef USE_AS_WCSCMP | 
|---|
| 662 | movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx | 
|---|
| 663 | xorl	%eax, %eax | 
|---|
| 664 | cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx | 
|---|
| 665 | je	L(ret5) | 
|---|
| 666 | setl	%al | 
|---|
| 667 | negl	%eax | 
|---|
| 668 | xorl	%r8d, %eax | 
|---|
| 669 | # else | 
|---|
| 670 | movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax | 
|---|
| 671 | movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx | 
|---|
| 672 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 673 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 674 | subl	%ecx, %eax | 
|---|
| 675 | xorl	%r8d, %eax | 
|---|
| 676 | subl	%r8d, %eax | 
|---|
| 677 | # endif | 
|---|
| 678 | L(ret5): | 
|---|
| 679 | ret | 
|---|
| 680 |  | 
|---|
| 681 | # ifdef USE_AS_STRNCMP | 
|---|
| 682 | .p2align 4,, 2 | 
|---|
| 683 | L(ret_zero_end): | 
|---|
| 684 | xorl	%eax, %eax | 
|---|
| 685 | ret | 
|---|
| 686 | # endif | 
|---|
| 687 |  | 
|---|
| 688 |  | 
|---|
| 689 | /* The L(return_vec_N_end) differ from L(return_vec_N) in that | 
|---|
| 690 | they use the value of `r8` to negate the return value. This is | 
|---|
| 691 | because the page cross logic can swap `rdi` and `rsi`.  */ | 
|---|
| 692 | .p2align 4,, 10 | 
|---|
| 693 | # ifdef USE_AS_STRNCMP | 
|---|
| 694 | L(return_vec_1_end): | 
|---|
| 695 | #  if CHAR_PER_VEC <= 16 | 
|---|
| 696 | sall	$CHAR_PER_VEC, %ecx | 
|---|
| 697 | #  else | 
|---|
| 698 | salq	$CHAR_PER_VEC, %rcx | 
|---|
| 699 | #  endif | 
|---|
| 700 | # endif | 
|---|
| 701 | L(return_vec_0_end): | 
|---|
| 702 | # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) | 
|---|
| 703 | tzcntl	%ecx, %ecx | 
|---|
| 704 | # else | 
|---|
| 705 | tzcntq	%rcx, %rcx | 
|---|
| 706 | # endif | 
|---|
| 707 |  | 
|---|
| 708 | # ifdef USE_AS_STRNCMP | 
|---|
| 709 | cmpq	%rcx, %rdx | 
|---|
| 710 | jbe	L(ret_zero_end) | 
|---|
| 711 | # endif | 
|---|
| 712 |  | 
|---|
| 713 | # ifdef USE_AS_WCSCMP | 
|---|
| 714 | movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 715 | xorl	%eax, %eax | 
|---|
| 716 | cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 717 | je	L(ret6) | 
|---|
| 718 | setl	%al | 
|---|
| 719 | negl	%eax | 
|---|
| 720 | /* This is the non-zero case for `eax` so just xorl with `r8d` | 
|---|
| 721 | to flip it if `rdi` and `rsi` were swapped.  */ | 
|---|
| 722 | xorl	%r8d, %eax | 
|---|
| 723 | # else | 
|---|
| 724 | movzbl	(%rdi, %rcx), %eax | 
|---|
| 725 | movzbl	(%rsi, %rcx), %ecx | 
|---|
| 726 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 727 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 728 | subl	%ecx, %eax | 
|---|
| 729 | /* Flip `eax` if `rdi` and `rsi` were swapped in page cross | 
|---|
| 730 | logic. Subtract `r8d` after xor for zero case.  */ | 
|---|
| 731 | xorl	%r8d, %eax | 
|---|
| 732 | subl	%r8d, %eax | 
|---|
| 733 | # endif | 
|---|
| 734 | L(ret6): | 
|---|
| 735 | ret | 
|---|
| 736 |  | 
|---|
| 737 | # ifndef USE_AS_STRNCMP | 
|---|
| 738 | .p2align 4,, 10 | 
|---|
| 739 | L(return_vec_1_end): | 
|---|
| 740 | tzcntl	%ecx, %ecx | 
|---|
| 741 | #  ifdef USE_AS_WCSCMP | 
|---|
| 742 | movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 743 | xorl	%eax, %eax | 
|---|
| 744 | cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 745 | je	L(ret7) | 
|---|
| 746 | setl	%al | 
|---|
| 747 | negl	%eax | 
|---|
| 748 | xorl	%r8d, %eax | 
|---|
| 749 | #  else | 
|---|
| 750 | movzbl	VEC_SIZE(%rdi, %rcx), %eax | 
|---|
| 751 | movzbl	VEC_SIZE(%rsi, %rcx), %ecx | 
|---|
| 752 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 753 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 754 | subl	%ecx, %eax | 
|---|
| 755 | xorl	%r8d, %eax | 
|---|
| 756 | subl	%r8d, %eax | 
|---|
| 757 | #  endif | 
|---|
| 758 | L(ret7): | 
|---|
| 759 | ret | 
|---|
| 760 | # endif | 
|---|
| 761 |  | 
|---|
| 762 |  | 
|---|
| 763 | /* Page cross in rsi in next 4x VEC.  */ | 
|---|
| 764 |  | 
|---|
| 765 | /* TODO: Improve logic here.  */ | 
|---|
| 766 | .p2align 4,, 10 | 
|---|
| 767 | L(page_cross_during_loop): | 
|---|
| 768 | /* eax contains [distance_from_page - (VEC_SIZE * 4)].  */ | 
|---|
| 769 |  | 
|---|
| 770 | /* Optimistically rsi and rdi are both aligned in which case we | 
|---|
| 771 | don't need any logic here.  */ | 
|---|
| 772 | cmpl	$-(VEC_SIZE * 4), %eax | 
|---|
| 773 | /* Don't adjust eax before jumping back to loop and we will | 
|---|
| 774 | never hit page cross case again.  */ | 
|---|
| 775 | je	L(loop_skip_page_cross_check) | 
|---|
| 776 |  | 
|---|
| 777 | /* Check if we can safely load a VEC.  */ | 
|---|
| 778 | cmpl	$-(VEC_SIZE * 3), %eax | 
|---|
| 779 | jle	L(less_1x_vec_till_page_cross) | 
|---|
| 780 |  | 
|---|
| 781 | VMOVA	(%rdi), %YMM0 | 
|---|
| 782 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 783 | CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} | 
|---|
| 784 | kmovd	%k1, %ecx | 
|---|
| 785 | TESTEQ	%ecx | 
|---|
| 786 | jnz	L(return_vec_0_end) | 
|---|
| 787 |  | 
|---|
| 788 | /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */ | 
|---|
| 789 | cmpl	$-(VEC_SIZE * 2), %eax | 
|---|
| 790 | jg	L(more_2x_vec_till_page_cross) | 
|---|
| 791 |  | 
|---|
| 792 | .p2align 4,, 4 | 
|---|
| 793 | L(less_1x_vec_till_page_cross): | 
|---|
| 794 | subl	$-(VEC_SIZE * 4), %eax | 
|---|
| 795 | /* Guaranteed safe to read from rdi - VEC_SIZE here. The only | 
|---|
| 796 | concerning case is first iteration if incoming s1 was near start | 
|---|
| 797 | of a page and s2 near end. If s1 was near the start of the page | 
|---|
| 798 | we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe | 
|---|
| 799 | to read back -VEC_SIZE. If rdi is truly at the start of a page | 
|---|
| 800 | here, it means the previous page (rdi - VEC_SIZE) has already | 
|---|
| 801 | been loaded earlier so must be valid.  */ | 
|---|
| 802 | VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0 | 
|---|
| 803 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 804 | CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} | 
|---|
| 805 | /* Mask of potentially valid bits. The lower bits can be out of | 
|---|
| 806 | range comparisons (but safe regarding page crosses).  */ | 
|---|
| 807 |  | 
|---|
| 808 | # ifdef USE_AS_WCSCMP | 
|---|
| 809 | movl	$-1, %r10d | 
|---|
| 810 | movl	%esi, %ecx | 
|---|
| 811 | andl	$(VEC_SIZE - 1), %ecx | 
|---|
| 812 | shrl	$2, %ecx | 
|---|
| 813 | shlxl	%ecx, %r10d, %ecx | 
|---|
| 814 | movzbl	%cl, %r10d | 
|---|
| 815 | # else | 
|---|
| 816 | movl	$-1, %ecx | 
|---|
| 817 | shlxl	%esi, %ecx, %r10d | 
|---|
| 818 | # endif | 
|---|
| 819 |  | 
|---|
| 820 | kmovd	%k1, %ecx | 
|---|
| 821 | notl	%ecx | 
|---|
| 822 |  | 
|---|
| 823 |  | 
|---|
| 824 | # ifdef USE_AS_STRNCMP | 
|---|
| 825 | #  ifdef USE_AS_WCSCMP | 
|---|
| 826 | /* NB: strcasecmp not used with WCSCMP so this access to r11 is | 
|---|
| 827 | safe.  */ | 
|---|
| 828 | movl	%eax, %r11d | 
|---|
| 829 | shrl	$2, %r11d | 
|---|
| 830 | cmpq	%r11, %rdx | 
|---|
| 831 | #  else | 
|---|
| 832 | cmpq	%rax, %rdx | 
|---|
| 833 | #  endif | 
|---|
| 834 | jbe	L(return_page_cross_end_check) | 
|---|
| 835 | # endif | 
|---|
| 836 | movl	%eax, %OFFSET_REG | 
|---|
| 837 |  | 
|---|
| 838 | /* Readjust eax before potentially returning to the loop.  */ | 
|---|
| 839 | addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax | 
|---|
| 840 |  | 
|---|
| 841 | andl	%r10d, %ecx | 
|---|
| 842 | jz	L(loop_skip_page_cross_check) | 
|---|
| 843 |  | 
|---|
| 844 | .p2align 4,, 3 | 
|---|
| 845 | L(return_page_cross_end): | 
|---|
| 846 | tzcntl	%ecx, %ecx | 
|---|
| 847 |  | 
|---|
| 848 | # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) | 
|---|
| 849 | leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx | 
|---|
| 850 | L(return_page_cross_cmp_mem): | 
|---|
| 851 | # else | 
|---|
| 852 | addl	%OFFSET_REG, %ecx | 
|---|
| 853 | # endif | 
|---|
| 854 | # ifdef USE_AS_WCSCMP | 
|---|
| 855 | movl	VEC_OFFSET(%rdi, %rcx), %edx | 
|---|
| 856 | xorl	%eax, %eax | 
|---|
| 857 | cmpl	VEC_OFFSET(%rsi, %rcx), %edx | 
|---|
| 858 | je	L(ret8) | 
|---|
| 859 | setl	%al | 
|---|
| 860 | negl	%eax | 
|---|
| 861 | xorl	%r8d, %eax | 
|---|
| 862 | # else | 
|---|
| 863 | movzbl	VEC_OFFSET(%rdi, %rcx), %eax | 
|---|
| 864 | movzbl	VEC_OFFSET(%rsi, %rcx), %ecx | 
|---|
| 865 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 866 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 867 | subl	%ecx, %eax | 
|---|
| 868 | xorl	%r8d, %eax | 
|---|
| 869 | subl	%r8d, %eax | 
|---|
| 870 | # endif | 
|---|
| 871 | L(ret8): | 
|---|
| 872 | ret | 
|---|
| 873 |  | 
|---|
| 874 | # ifdef USE_AS_STRNCMP | 
|---|
| 875 | .p2align 4,, 10 | 
|---|
| 876 | L(return_page_cross_end_check): | 
|---|
| 877 | andl	%r10d, %ecx | 
|---|
| 878 | tzcntl	%ecx, %ecx | 
|---|
| 879 | leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx | 
|---|
| 880 | #  ifdef USE_AS_WCSCMP | 
|---|
| 881 | sall	$2, %edx | 
|---|
| 882 | #  endif | 
|---|
| 883 | cmpl	%ecx, %edx | 
|---|
| 884 | ja	L(return_page_cross_cmp_mem) | 
|---|
| 885 | xorl	%eax, %eax | 
|---|
| 886 | ret | 
|---|
| 887 | # endif | 
|---|
| 888 |  | 
|---|
| 889 |  | 
|---|
| 890 | .p2align 4,, 10 | 
|---|
| 891 | L(more_2x_vec_till_page_cross): | 
|---|
| 892 | /* If more than 2x VEC till page cross we will complete a full loop | 
|---|
| 893 | iteration here.  */ | 
|---|
| 894 |  | 
|---|
| 895 | VMOVA	VEC_SIZE(%rdi), %YMM0 | 
|---|
| 896 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 897 | CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} | 
|---|
| 898 | kmovd	%k1, %ecx | 
|---|
| 899 | TESTEQ	%ecx | 
|---|
| 900 | jnz	L(return_vec_1_end) | 
|---|
| 901 |  | 
|---|
| 902 | # ifdef USE_AS_STRNCMP | 
|---|
| 903 | cmpq	$(CHAR_PER_VEC * 2), %rdx | 
|---|
| 904 | jbe	L(ret_zero_in_loop_page_cross) | 
|---|
| 905 | # endif | 
|---|
| 906 |  | 
|---|
| 907 | subl	$-(VEC_SIZE * 4), %eax | 
|---|
| 908 |  | 
|---|
| 909 | /* Safe to include comparisons from lower bytes.  */ | 
|---|
| 910 | VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0 | 
|---|
| 911 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 912 | CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} | 
|---|
| 913 | kmovd	%k1, %ecx | 
|---|
| 914 | TESTEQ	%ecx | 
|---|
| 915 | jnz	L(return_vec_page_cross_0) | 
|---|
| 916 |  | 
|---|
| 917 | VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0 | 
|---|
| 918 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 919 | CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} | 
|---|
| 920 | kmovd	%k1, %ecx | 
|---|
| 921 | TESTEQ	%ecx | 
|---|
| 922 | jnz	L(return_vec_page_cross_1) | 
|---|
| 923 |  | 
|---|
| 924 | # ifdef USE_AS_STRNCMP | 
|---|
| 925 | /* Must check length here as length might preclude reading next | 
|---|
| 926 | page.  */ | 
|---|
| 927 | #  ifdef USE_AS_WCSCMP | 
|---|
| 928 | /* NB: strcasecmp not used with WCSCMP so this access to r11 is | 
|---|
| 929 | safe.  */ | 
|---|
| 930 | movl	%eax, %r11d | 
|---|
| 931 | shrl	$2, %r11d | 
|---|
| 932 | cmpq	%r11, %rdx | 
|---|
| 933 | #  else | 
|---|
| 934 | cmpq	%rax, %rdx | 
|---|
| 935 | #  endif | 
|---|
| 936 | jbe	L(ret_zero_in_loop_page_cross) | 
|---|
| 937 | # endif | 
|---|
| 938 |  | 
|---|
| 939 | /* Finish the loop.  */ | 
|---|
| 940 | VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4 | 
|---|
| 941 | VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6 | 
|---|
| 942 | VPMINU	%YMM4, %YMM6, %YMM9 | 
|---|
| 943 | VPTESTM	%YMM9, %YMM9, %k1 | 
|---|
| 944 | # ifndef USE_AS_STRCASECMP_L | 
|---|
| 945 | vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 | 
|---|
| 946 | /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */ | 
|---|
| 947 | vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 | 
|---|
| 948 | # else | 
|---|
| 949 | VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5 | 
|---|
| 950 | TOLOWER_YMM (%YMM4, %YMM5) | 
|---|
| 951 | VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7 | 
|---|
| 952 | TOLOWER_YMM (%YMM6, %YMM7) | 
|---|
| 953 | vpxorq	%YMM4, %YMM5, %YMM5 | 
|---|
| 954 | vpternlogd $0xde, %YMM7, %YMM5, %YMM6 | 
|---|
| 955 | # endif | 
|---|
| 956 | VPTESTNM %YMM6, %YMM6, %k0{%k1} | 
|---|
| 957 | kmovd	%k0, %LOOP_REG | 
|---|
| 958 | TESTEQ	%LOOP_REG | 
|---|
| 959 | jnz	L(return_vec_2_3_end) | 
|---|
| 960 |  | 
|---|
| 961 | /* Best for code size to include uncond-jmp here. Would be faster | 
|---|
| 962 | if this case is hot to duplicate the L(return_vec_2_3_end) code | 
|---|
| 963 | as fall-through and have jump back to loop on mismatch | 
|---|
| 964 | comparison.  */ | 
|---|
| 965 | subq	$-(VEC_SIZE * 4), %rdi | 
|---|
| 966 | subq	$-(VEC_SIZE * 4), %rsi | 
|---|
| 967 | addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax | 
|---|
| 968 | # ifdef USE_AS_STRNCMP | 
|---|
| 969 | subq	$(CHAR_PER_VEC * 4), %rdx | 
|---|
| 970 | ja	L(loop_skip_page_cross_check) | 
|---|
| 971 | L(ret_zero_in_loop_page_cross): | 
|---|
| 972 | xorl	%eax, %eax | 
|---|
| 973 | ret | 
|---|
| 974 | # else | 
|---|
| 975 | jmp	L(loop_skip_page_cross_check) | 
|---|
| 976 | # endif | 
|---|
| 977 |  | 
|---|
| 978 |  | 
|---|
| 979 | .p2align 4,, 10 | 
|---|
| 980 | L(return_vec_page_cross_0): | 
|---|
| 981 | addl	$-VEC_SIZE, %eax | 
|---|
| 982 | L(return_vec_page_cross_1): | 
|---|
| 983 | tzcntl	%ecx, %ecx | 
|---|
| 984 | # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP | 
|---|
| 985 | leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx | 
|---|
| 986 | #  ifdef USE_AS_STRNCMP | 
|---|
| 987 | #   ifdef USE_AS_WCSCMP | 
|---|
| 988 | /* Must divide ecx instead of multiply rdx due to overflow.  */ | 
|---|
| 989 | movl	%ecx, %eax | 
|---|
| 990 | shrl	$2, %eax | 
|---|
| 991 | cmpq	%rax, %rdx | 
|---|
| 992 | #   else | 
|---|
| 993 | cmpq	%rcx, %rdx | 
|---|
| 994 | #   endif | 
|---|
| 995 | jbe	L(ret_zero_in_loop_page_cross) | 
|---|
| 996 | #  endif | 
|---|
| 997 | # else | 
|---|
| 998 | addl	%eax, %ecx | 
|---|
| 999 | # endif | 
|---|
| 1000 |  | 
|---|
| 1001 | # ifdef USE_AS_WCSCMP | 
|---|
| 1002 | movl	VEC_OFFSET(%rdi, %rcx), %edx | 
|---|
| 1003 | xorl	%eax, %eax | 
|---|
| 1004 | cmpl	VEC_OFFSET(%rsi, %rcx), %edx | 
|---|
| 1005 | je	L(ret9) | 
|---|
| 1006 | setl	%al | 
|---|
| 1007 | negl	%eax | 
|---|
| 1008 | xorl	%r8d, %eax | 
|---|
| 1009 | # else | 
|---|
| 1010 | movzbl	VEC_OFFSET(%rdi, %rcx), %eax | 
|---|
| 1011 | movzbl	VEC_OFFSET(%rsi, %rcx), %ecx | 
|---|
| 1012 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 1013 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 1014 | subl	%ecx, %eax | 
|---|
| 1015 | xorl	%r8d, %eax | 
|---|
| 1016 | subl	%r8d, %eax | 
|---|
| 1017 | # endif | 
|---|
| 1018 | L(ret9): | 
|---|
| 1019 | ret | 
|---|
| 1020 |  | 
|---|
| 1021 |  | 
|---|
| 1022 | .p2align 4,, 10 | 
|---|
| 1023 | L(page_cross): | 
|---|
| 1024 | # ifndef USE_AS_STRNCMP | 
|---|
| 1025 | /* If both are VEC aligned we don't need any special logic here. | 
|---|
| 1026 | Only valid for strcmp where stop condition is guaranteed to be | 
|---|
| 1027 | reachable by just reading memory.  */ | 
|---|
| 1028 | testl	$((VEC_SIZE - 1) << 20), %eax | 
|---|
| 1029 | jz	L(no_page_cross) | 
|---|
| 1030 | # endif | 
|---|
| 1031 |  | 
|---|
| 1032 | movl	%edi, %eax | 
|---|
| 1033 | movl	%esi, %ecx | 
|---|
| 1034 | andl	$(PAGE_SIZE - 1), %eax | 
|---|
| 1035 | andl	$(PAGE_SIZE - 1), %ecx | 
|---|
| 1036 |  | 
|---|
| 1037 | xorl	%OFFSET_REG, %OFFSET_REG | 
|---|
| 1038 |  | 
|---|
| 1039 | /* Check which is closer to page cross, s1 or s2.  */ | 
|---|
| 1040 | cmpl	%eax, %ecx | 
|---|
| 1041 | jg	L(page_cross_s2) | 
|---|
| 1042 |  | 
|---|
| 1043 | /* The previous page cross check has false positives. Check for | 
|---|
| 1044 | true positive as page cross logic is very expensive.  */ | 
|---|
| 1045 | subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax | 
|---|
| 1046 | jbe	L(no_page_cross) | 
|---|
| 1047 |  | 
|---|
| 1048 |  | 
|---|
| 1049 | /* Set r8 to not interfere with normal return value (rdi and rsi | 
|---|
| 1050 | did not swap).  */ | 
|---|
| 1051 | # ifdef USE_AS_WCSCMP | 
|---|
| 1052 | /* Any non-zero positive value that doesn't interfere with 0x1. | 
|---|
| 1053 | */ | 
|---|
| 1054 | movl	$2, %r8d | 
|---|
| 1055 | # else | 
|---|
| 1056 | xorl	%r8d, %r8d | 
|---|
| 1057 | # endif | 
|---|
| 1058 |  | 
|---|
| 1059 | /* Check if less than 1x VEC till page cross.  */ | 
|---|
| 1060 | subl	$(VEC_SIZE * 3), %eax | 
|---|
| 1061 | jg	L(less_1x_vec_till_page) | 
|---|
| 1062 |  | 
|---|
| 1063 |  | 
|---|
| 1064 | /* If more than 1x VEC till page cross, loop through safely | 
|---|
| 1065 | loadable memory until within 1x VEC of page cross.  */ | 
|---|
| 1066 | .p2align 4,, 8 | 
|---|
| 1067 | L(page_cross_loop): | 
|---|
| 1068 | VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 | 
|---|
| 1069 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 1070 | CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} | 
|---|
| 1071 | kmovd	%k1, %ecx | 
|---|
| 1072 | TESTEQ	%ecx | 
|---|
| 1073 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1074 | addl	$CHAR_PER_VEC, %OFFSET_REG | 
|---|
| 1075 | # ifdef USE_AS_STRNCMP | 
|---|
| 1076 | cmpq	%OFFSET_REG64, %rdx | 
|---|
| 1077 | jbe	L(ret_zero_page_cross) | 
|---|
| 1078 | # endif | 
|---|
| 1079 | addl	$VEC_SIZE, %eax | 
|---|
| 1080 | jl	L(page_cross_loop) | 
|---|
| 1081 |  | 
|---|
| 1082 | # ifdef USE_AS_WCSCMP | 
|---|
| 1083 | shrl	$2, %eax | 
|---|
| 1084 | # endif | 
|---|
| 1085 |  | 
|---|
| 1086 |  | 
|---|
| 1087 | subl	%eax, %OFFSET_REG | 
|---|
| 1088 | /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed | 
|---|
| 1089 | to not cross page so is safe to load. Since we have already | 
|---|
| 1090 | loaded at least 1 VEC from rsi it is also guaranteed to be safe. | 
|---|
| 1091 | */ | 
|---|
| 1092 | VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 | 
|---|
| 1093 | VPTESTM	%YMM0, %YMM0, %k2 | 
|---|
| 1094 | CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} | 
|---|
| 1095 |  | 
|---|
| 1096 | kmovd	%k1, %ecx | 
|---|
| 1097 | # ifdef USE_AS_STRNCMP | 
|---|
| 1098 | leal	CHAR_PER_VEC(%OFFSET_REG64), %eax | 
|---|
| 1099 | cmpq	%rax, %rdx | 
|---|
| 1100 | jbe	L(check_ret_vec_page_cross2) | 
|---|
| 1101 | #  ifdef USE_AS_WCSCMP | 
|---|
| 1102 | addq	$-(CHAR_PER_VEC * 2), %rdx | 
|---|
| 1103 | #  else | 
|---|
| 1104 | addq	%rdi, %rdx | 
|---|
| 1105 | #  endif | 
|---|
| 1106 | # endif | 
|---|
| 1107 | TESTEQ	%ecx | 
|---|
| 1108 | jz	L(prepare_loop_no_len) | 
|---|
| 1109 |  | 
|---|
| 1110 | .p2align 4,, 4 | 
|---|
| 1111 | L(ret_vec_page_cross): | 
|---|
| 1112 | # ifndef USE_AS_STRNCMP | 
|---|
| 1113 | L(check_ret_vec_page_cross): | 
|---|
| 1114 | # endif | 
|---|
| 1115 | tzcntl	%ecx, %ecx | 
|---|
| 1116 | addl	%OFFSET_REG, %ecx | 
|---|
| 1117 | L(ret_vec_page_cross_cont): | 
|---|
| 1118 | # ifdef USE_AS_WCSCMP | 
|---|
| 1119 | movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 1120 | xorl	%eax, %eax | 
|---|
| 1121 | cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx | 
|---|
| 1122 | je	L(ret12) | 
|---|
| 1123 | setl	%al | 
|---|
| 1124 | negl	%eax | 
|---|
| 1125 | xorl	%r8d, %eax | 
|---|
| 1126 | # else | 
|---|
| 1127 | movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax | 
|---|
| 1128 | movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx | 
|---|
| 1129 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 1130 | TOLOWER_gpr (%rcx, %ecx) | 
|---|
| 1131 | subl	%ecx, %eax | 
|---|
| 1132 | xorl	%r8d, %eax | 
|---|
| 1133 | subl	%r8d, %eax | 
|---|
| 1134 | # endif | 
|---|
| 1135 | L(ret12): | 
|---|
| 1136 | ret | 
|---|
| 1137 |  | 
|---|
| 1138 |  | 
|---|
| 1139 | # ifdef USE_AS_STRNCMP | 
|---|
| 1140 | .p2align 4,, 10 | 
|---|
| 1141 | L(check_ret_vec_page_cross2): | 
|---|
| 1142 | TESTEQ	%ecx | 
|---|
| 1143 | L(check_ret_vec_page_cross): | 
|---|
| 1144 | tzcntl	%ecx, %ecx | 
|---|
| 1145 | addl	%OFFSET_REG, %ecx | 
|---|
| 1146 | cmpq	%rcx, %rdx | 
|---|
| 1147 | ja	L(ret_vec_page_cross_cont) | 
|---|
| 1148 | .p2align 4,, 2 | 
|---|
| 1149 | L(ret_zero_page_cross): | 
|---|
| 1150 | xorl	%eax, %eax | 
|---|
| 1151 | ret | 
|---|
| 1152 | # endif | 
|---|
| 1153 |  | 
|---|
| 1154 | .p2align 4,, 4 | 
|---|
| 1155 | L(page_cross_s2): | 
|---|
| 1156 | /* Ensure this is a true page cross.  */ | 
|---|
| 1157 | subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx | 
|---|
| 1158 | jbe	L(no_page_cross) | 
|---|
| 1159 |  | 
|---|
| 1160 |  | 
|---|
| 1161 | movl	%ecx, %eax | 
|---|
| 1162 | movq	%rdi, %rcx | 
|---|
| 1163 | movq	%rsi, %rdi | 
|---|
| 1164 | movq	%rcx, %rsi | 
|---|
| 1165 |  | 
|---|
| 1166 | /* set r8 to negate return value as rdi and rsi swapped.  */ | 
|---|
| 1167 | # ifdef USE_AS_WCSCMP | 
|---|
| 1168 | movl	$-4, %r8d | 
|---|
| 1169 | # else | 
|---|
| 1170 | movl	$-1, %r8d | 
|---|
| 1171 | # endif | 
|---|
| 1172 | xorl	%OFFSET_REG, %OFFSET_REG | 
|---|
| 1173 |  | 
|---|
| 1174 | /* Check if more than 1x VEC till page cross.  */ | 
|---|
| 1175 | subl	$(VEC_SIZE * 3), %eax | 
|---|
| 1176 | jle	L(page_cross_loop) | 
|---|
| 1177 |  | 
|---|
| 1178 | .p2align 4,, 6 | 
|---|
| 1179 | L(less_1x_vec_till_page): | 
|---|
| 1180 | # ifdef USE_AS_WCSCMP | 
|---|
| 1181 | shrl	$2, %eax | 
|---|
| 1182 | # endif | 
|---|
| 1183 | /* Find largest load size we can use.  */ | 
|---|
| 1184 | cmpl	$(16 / SIZE_OF_CHAR), %eax | 
|---|
| 1185 | ja	L(less_16_till_page) | 
|---|
| 1186 |  | 
|---|
| 1187 | /* Use 16 byte comparison.  */ | 
|---|
| 1188 | vmovdqu	(%rdi), %xmm0 | 
|---|
| 1189 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1190 | CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} | 
|---|
| 1191 | kmovd	%k1, %ecx | 
|---|
| 1192 | # ifdef USE_AS_WCSCMP | 
|---|
| 1193 | subl	$0xf, %ecx | 
|---|
| 1194 | # else | 
|---|
| 1195 | incw	%cx | 
|---|
| 1196 | # endif | 
|---|
| 1197 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1198 | movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1199 | # ifdef USE_AS_STRNCMP | 
|---|
| 1200 | cmpq	%OFFSET_REG64, %rdx | 
|---|
| 1201 | jbe	L(ret_zero_page_cross_slow_case0) | 
|---|
| 1202 | subl	%eax, %OFFSET_REG | 
|---|
| 1203 | # else | 
|---|
| 1204 | /* Explicit check for 16 byte alignment.  */ | 
|---|
| 1205 | subl	%eax, %OFFSET_REG | 
|---|
| 1206 | jz	L(prepare_loop) | 
|---|
| 1207 | # endif | 
|---|
| 1208 | vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 | 
|---|
| 1209 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1210 | CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} | 
|---|
| 1211 | kmovd	%k1, %ecx | 
|---|
| 1212 | # ifdef USE_AS_WCSCMP | 
|---|
| 1213 | subl	$0xf, %ecx | 
|---|
| 1214 | # else | 
|---|
| 1215 | incw	%cx | 
|---|
| 1216 | # endif | 
|---|
| 1217 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1218 | # ifdef USE_AS_STRNCMP | 
|---|
| 1219 | addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1220 | subq	%OFFSET_REG64, %rdx | 
|---|
| 1221 | jbe	L(ret_zero_page_cross_slow_case0) | 
|---|
| 1222 | subq	$-(CHAR_PER_VEC * 4), %rdx | 
|---|
| 1223 |  | 
|---|
| 1224 | leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1225 | leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1226 | # else | 
|---|
| 1227 | leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1228 | leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1229 | # endif | 
|---|
| 1230 | jmp	L(prepare_loop_aligned) | 
|---|
| 1231 |  | 
|---|
| 1232 | # ifdef USE_AS_STRNCMP | 
|---|
| 1233 | .p2align 4,, 2 | 
|---|
| 1234 | L(ret_zero_page_cross_slow_case0): | 
|---|
| 1235 | xorl	%eax, %eax | 
|---|
| 1236 | ret | 
|---|
| 1237 | # endif | 
|---|
| 1238 |  | 
|---|
| 1239 |  | 
|---|
| 1240 | .p2align 4,, 10 | 
|---|
| 1241 | L(less_16_till_page): | 
|---|
| 1242 | cmpl	$(24 / SIZE_OF_CHAR), %eax | 
|---|
| 1243 | ja	L(less_8_till_page) | 
|---|
| 1244 |  | 
|---|
| 1245 | /* Use 8 byte comparison.  */ | 
|---|
| 1246 | vmovq	(%rdi), %xmm0 | 
|---|
| 1247 | vmovq	(%rsi), %xmm1 | 
|---|
| 1248 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1249 | CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} | 
|---|
| 1250 | kmovd	%k1, %ecx | 
|---|
| 1251 | # ifdef USE_AS_WCSCMP | 
|---|
| 1252 | subl	$0x3, %ecx | 
|---|
| 1253 | # else | 
|---|
| 1254 | incb	%cl | 
|---|
| 1255 | # endif | 
|---|
| 1256 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1257 |  | 
|---|
| 1258 |  | 
|---|
| 1259 | # ifdef USE_AS_STRNCMP | 
|---|
| 1260 | cmpq	$(8 / SIZE_OF_CHAR), %rdx | 
|---|
| 1261 | jbe	L(ret_zero_page_cross_slow_case0) | 
|---|
| 1262 | # endif | 
|---|
| 1263 | movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1264 | subl	%eax, %OFFSET_REG | 
|---|
| 1265 |  | 
|---|
| 1266 | vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 | 
|---|
| 1267 | vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 | 
|---|
| 1268 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1269 | CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} | 
|---|
| 1270 | kmovd	%k1, %ecx | 
|---|
| 1271 | # ifdef USE_AS_WCSCMP | 
|---|
| 1272 | subl	$0x3, %ecx | 
|---|
| 1273 | # else | 
|---|
| 1274 | incb	%cl | 
|---|
| 1275 | # endif | 
|---|
| 1276 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1277 |  | 
|---|
| 1278 |  | 
|---|
| 1279 | # ifdef USE_AS_STRNCMP | 
|---|
| 1280 | addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1281 | subq	%OFFSET_REG64, %rdx | 
|---|
| 1282 | jbe	L(ret_zero_page_cross_slow_case0) | 
|---|
| 1283 | subq	$-(CHAR_PER_VEC * 4), %rdx | 
|---|
| 1284 |  | 
|---|
| 1285 | leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1286 | leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1287 | # else | 
|---|
| 1288 | leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1289 | leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1290 | # endif | 
|---|
| 1291 | jmp	L(prepare_loop_aligned) | 
|---|
| 1292 |  | 
|---|
| 1293 |  | 
|---|
| 1294 |  | 
|---|
| 1295 |  | 
|---|
| 1296 | .p2align 4,, 10 | 
|---|
| 1297 | L(less_8_till_page): | 
|---|
| 1298 | # ifdef USE_AS_WCSCMP | 
|---|
| 1299 | /* If using wchar then this is the only check before we reach | 
|---|
| 1300 | the page boundary.  */ | 
|---|
| 1301 | movl	(%rdi), %eax | 
|---|
| 1302 | movl	(%rsi), %ecx | 
|---|
| 1303 | cmpl	%ecx, %eax | 
|---|
| 1304 | jnz	L(ret_less_8_wcs) | 
|---|
| 1305 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1306 | addq	$-(CHAR_PER_VEC * 2), %rdx | 
|---|
| 1307 | /* We already checked for len <= 1 so cannot hit that case here. | 
|---|
| 1308 | */ | 
|---|
| 1309 | #  endif | 
|---|
| 1310 | testl	%eax, %eax | 
|---|
| 1311 | jnz	L(prepare_loop) | 
|---|
| 1312 | ret | 
|---|
| 1313 |  | 
|---|
| 1314 | .p2align 4,, 8 | 
|---|
| 1315 | L(ret_less_8_wcs): | 
|---|
| 1316 | setl	%OFFSET_REG8 | 
|---|
| 1317 | negl	%OFFSET_REG | 
|---|
| 1318 | movl	%OFFSET_REG, %eax | 
|---|
| 1319 | xorl	%r8d, %eax | 
|---|
| 1320 | ret | 
|---|
| 1321 |  | 
|---|
| 1322 | # else | 
|---|
| 1323 | cmpl	$28, %eax | 
|---|
| 1324 | ja	L(less_4_till_page) | 
|---|
| 1325 |  | 
|---|
| 1326 | vmovd	(%rdi), %xmm0 | 
|---|
| 1327 | vmovd	(%rsi), %xmm1 | 
|---|
| 1328 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1329 | CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} | 
|---|
| 1330 | kmovd	%k1, %ecx | 
|---|
| 1331 | subl	$0xf, %ecx | 
|---|
| 1332 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1333 |  | 
|---|
| 1334 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1335 | cmpq	$4, %rdx | 
|---|
| 1336 | jbe	L(ret_zero_page_cross_slow_case1) | 
|---|
| 1337 | #  endif | 
|---|
| 1338 | movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1339 | subl	%eax, %OFFSET_REG | 
|---|
| 1340 |  | 
|---|
| 1341 | vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 | 
|---|
| 1342 | vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 | 
|---|
| 1343 | VPTESTM	%xmm0, %xmm0, %k2 | 
|---|
| 1344 | CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} | 
|---|
| 1345 | kmovd	%k1, %ecx | 
|---|
| 1346 | subl	$0xf, %ecx | 
|---|
| 1347 | jnz	L(check_ret_vec_page_cross) | 
|---|
| 1348 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1349 | addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG | 
|---|
| 1350 | subq	%OFFSET_REG64, %rdx | 
|---|
| 1351 | jbe	L(ret_zero_page_cross_slow_case1) | 
|---|
| 1352 | subq	$-(CHAR_PER_VEC * 4), %rdx | 
|---|
| 1353 |  | 
|---|
| 1354 | leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1355 | leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1356 | #  else | 
|---|
| 1357 | leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi | 
|---|
| 1358 | leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi | 
|---|
| 1359 | #  endif | 
|---|
| 1360 | jmp	L(prepare_loop_aligned) | 
|---|
| 1361 |  | 
|---|
| 1362 |  | 
|---|
| 1363 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1364 | .p2align 4,, 2 | 
|---|
| 1365 | L(ret_zero_page_cross_slow_case1): | 
|---|
| 1366 | xorl	%eax, %eax | 
|---|
| 1367 | ret | 
|---|
| 1368 | #  endif | 
|---|
| 1369 |  | 
|---|
| 1370 | .p2align 4,, 10 | 
|---|
| 1371 | L(less_4_till_page): | 
|---|
| 1372 | subq	%rdi, %rsi | 
|---|
| 1373 | /* Extremely slow byte comparison loop.  */ | 
|---|
| 1374 | L(less_4_loop): | 
|---|
| 1375 | movzbl	(%rdi), %eax | 
|---|
| 1376 | movzbl	(%rsi, %rdi), %ecx | 
|---|
| 1377 | TOLOWER_gpr (%rax, %eax) | 
|---|
| 1378 | TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) | 
|---|
| 1379 | subl	%BYTE_LOOP_REG, %eax | 
|---|
| 1380 | jnz	L(ret_less_4_loop) | 
|---|
| 1381 | testl	%ecx, %ecx | 
|---|
| 1382 | jz	L(ret_zero_4_loop) | 
|---|
| 1383 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1384 | decq	%rdx | 
|---|
| 1385 | jz	L(ret_zero_4_loop) | 
|---|
| 1386 | #  endif | 
|---|
| 1387 | incq	%rdi | 
|---|
| 1388 | /* End condition is reaching the page boundary (rdi is aligned).  */ | 
|---|
| 1389 | testl	$31, %edi | 
|---|
| 1390 | jnz	L(less_4_loop) | 
|---|
| 1391 | leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi | 
|---|
| 1392 | addq	$-(VEC_SIZE * 4), %rdi | 
|---|
| 1393 | #  ifdef USE_AS_STRNCMP | 
|---|
| 1394 | subq	$-(CHAR_PER_VEC * 4), %rdx | 
|---|
| 1395 | #  endif | 
|---|
| 1396 | jmp	L(prepare_loop_aligned) | 
|---|
| 1397 |  | 
|---|
| 1398 | L(ret_zero_4_loop): | 
|---|
| 1399 | xorl	%eax, %eax | 
|---|
| 1400 | ret | 
|---|
| 1401 | L(ret_less_4_loop): | 
|---|
| 1402 | xorl	%r8d, %eax | 
|---|
| 1403 | subl	%r8d, %eax | 
|---|
| 1404 | ret | 
|---|
| 1405 | # endif | 
|---|
| 1406 | cfi_endproc | 
|---|
| 1407 | .size	STRCMP, .-STRCMP | 
|---|
| 1408 | #endif | 
|---|
| 1409 |  | 
|---|