/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Compare packed dwords and store minimum.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Compare packed bytes and store minimum.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/
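/* A small illustration of why the comparison flavor matters (stand-alone
   C, for illustration only, not part of the build): the byte 0x80
   compares GREATER than 'a' when treated as an unsigned char (the
   strcmp/strncmp rule), while a wide element with its sign bit set
   compares LESS than 'a' as a signed dword (the wcscmp/wcsncmp rule).

   #include <limits.h>
   #include <stdio.h>

   int
   main (void)
   {
     unsigned char c = 0x80;     // 128 when compared as unsigned
     int w = INT_MIN;            // sign bit set, negative as signed
     printf ("%d %d\n", c > 'a', w < 'a');   // prints "1 1"
     return 0;
   }
*/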

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison is
   done on either packed bytes or dwords depending on USE_AS_WCSCMP.  In
   order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instructions, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
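/* The per-vector check can be summarized with AVX2 intrinsics.  A
   minimal C sketch, for illustration only (the helper name
   block_has_diff_or_null is made up and the code is not part of the
   build); it mirrors the VPCMPEQ/VPMINU/VPCMPEQ/vpmovmskb sequence
   used throughout this file for the byte (strcmp) case:

   #include <immintrin.h>
   #include <stdint.h>

   static inline uint32_t
   block_has_diff_or_null (const char *s1, const char *s2)
   {
     __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
     __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
     // eq: 0xff in every byte where s1 and s2 match, 0x00 elsewhere.
     __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
     // min is 0 in a byte iff that byte mismatched (eq == 0) or s1
     // holds a null terminator there (v1 == 0).
     __m256i min = _mm256_min_epu8 (v1, eq);
     __m256i hit = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
     // Bit i of the result is set iff byte i is a mismatch or a
     // terminator; tzcnt of a non-zero mask gives its index.
     return (uint32_t) _mm256_movemask_epi8 (hit);
   }
*/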

	.section SECTION(.text), "ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
#   ifndef __ILP32__
	movq	%rdx, %rcx
	/* Check if the length could overflow when multiplied by
	   sizeof(wchar_t).  Checking the top 8 bits covers all potential
	   overflow cases as well as cases where it is impossible for the
	   length to bound a valid memory region.  In these cases just
	   use 'wcscmp'.  */
	shrq	$56, %rcx
	jnz	__wcscmp_avx2
#   endif
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
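	/* A hedged C view of the wcsncmp length handling above (the
	   helper name wide_count_fits is made up; illustration only,
	   not part of the build): any of the top 8 bits being set means
	   n * sizeof (wchar_t) could overflow, or could not possibly
	   bound real memory anyway, so the bounded routine simply
	   defers to the unbounded one.

	   #include <stdbool.h>
	   #include <stddef.h>

	   static bool
	   wide_count_fits (size_t n, size_t *byte_count)
	   {
	     if (n >> 56)          // mirrors "shrq $56; jnz __wcscmp_avx2"
	       return false;       // caller falls back to plain wcscmp
	     *byte_count = n * 4;  // sizeof (wchar_t) == 4 on this target
	     return true;
	   }
	*/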
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
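	/* A hedged C sketch of the check above (the helper name
	   may_cross_page is made up; not part of the build): ORing the
	   two page offsets can only over-report a possible page
	   crossing, which is safe because the slow path is still
	   correct, only slower.

	   #include <stdbool.h>
	   #include <stdint.h>

	   static bool
	   may_cross_page (const void *s1, const void *s2)
	   {
	     uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (4096 - 1);
	     return off > 4096 - 4 * 32;    // PAGE_SIZE - VEC_SIZE * 4
	   }
	*/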
	/* Start comparing 4 vectors.  */
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl	%ecx, %esi
	jmp	L(loop_start)
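	/* A hedged C sketch of the setup above (names are made up; not
	   part of the build): align the first string up to the next
	   4 * VEC_SIZE boundary, move the second string by the same
	   delta, and count how many full 128-byte iterations fit before
	   the second string reaches the end of its page.

	   #include <stddef.h>
	   #include <stdint.h>

	   static size_t
	   blocks_before_page_cross (const char *s1, const char *s2,
	                             const char **a1, const char **a2)
	   {
	     uintptr_t delta = (((uintptr_t) s1 + 4 * 32)
	                        & -(uintptr_t) (4 * 32)) - (uintptr_t) s1;
	     *a1 = s1 + delta;                           // %rax
	     *a2 = s2 + delta;                           // %rdx
	     size_t to_page_end
	       = 4096 - (((uintptr_t) s2 + delta) & (4096 - 1));
	     return to_page_end >> 7;        // DIVIDE_BY_VEC_4_SHIFT
	   }
	*/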

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, differing from SSE2 strcmp
	   where ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
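	/* Illustration of the mask handling just above (hedged C, not
	   part of the build; the helper name first_hit is made up): the
	   two 32-byte masks are glued into one 64-bit mask, the low CL
	   bits covering the already-handled region are shifted out, and
	   the first remaining set bit, if any, locates the mismatch or
	   terminator.

	   #include <stdint.h>

	   static int
	   first_hit (uint32_t mask_lo, uint32_t mask_hi, unsigned skip)
	   {
	     uint64_t m = ((uint64_t) mask_hi << 32) | mask_lo;
	     m >>= skip;                           // drop ignored bytes
	     return m ? __builtin_ctzll (m) : -1;  // tzcntq when non-zero
	   }
	*/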
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is zero, VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  This label
	   checks whether the strncmp maximum offset has been reached.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER_RETURN
# endif
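	/* The setl/negl/orl sequence above is a branchless way to turn
	   the outcome of a signed compare of two unequal wide chars
	   into -1 or +1.  A hedged C equivalent (illustration only; the
	   function name is made up, not part of the build):

	   #include <stdint.h>

	   static int
	   wide_char_result (int32_t a, int32_t b)   // precondition: a != b
	   {
	     int r = (a < b);   // setl  : 1 if a < b, else 0
	     r = -r;            // negl  : -1 if a < b, else 0
	     r |= 1;            // orl $1: -1 stays -1, 0 becomes +1
	     return r;
	   }
	*/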

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting
	   with the wider ymm vector if possible, and with xmm if not.
	   If fetching 16 bytes (xmm) would still cross the boundary,
	   byte comparison must be done.  */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not
	   needed for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try a 4-byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif