/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0


# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif


# ifndef STRNCPY
# define STRNCPY __strncpy_evex
# endif

# ifdef USE_AS_WCSCPY
# define VMOVU_MASK vmovdqu32
# define VPCMPEQ vpcmpeqd
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define CHAR_SIZE 4

# define REP_MOVS rep movsd
# define REP_STOS rep stosl

# define USE_WIDE_CHAR

# else
# define VMOVU_MASK vmovdqu8
# define VPCMPEQ vpcmpeqb
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define CHAR_SIZE 1

# define REP_MOVS rep movsb
# define REP_STOS rep stosb
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"


# define VZERO VMM(7)
# define VZERO_256 VMM_256(7)
# define VZERO_128 VMM_128(7)

# if VEC_SIZE == 64
# define VZERO_HALF VZERO_256
# else
# define VZERO_HALF VZERO_128
# endif

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return, very long strings are handled by
	   running rep stos{b|l} to zero-fill the destination (which
	   will almost certainly segfault); if that succeeds then just
	   call OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
# ifdef USE_AS_WCSCPY
	decq %rdx
	movq %rdx, %rax
	/* 56 is the end of the maximum supported address space.  This
	   also catches a zero length, which wrapped to -1 above.  */
	shr $56, %rax
	jnz L(zero_len)
# else
	decq %rdx
	/* If the flag needs to become `jb`, replace `dec` with `sub`.  */
	jl L(zero_len)
# endif

	vpxorq %VZERO_128, %VZERO_128, %VZERO_128
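	/* If src is within VEC_SIZE of the end of a page, the first
	   VEC_SIZE load could fault, so take the page-cross path.  */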
	movl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(page_cross)

L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX

	/* If not STPCPY just save the return value (dst) ahead of
	   time.  */
# ifndef USE_AS_STPCPY
	movq %rdi, %rax
# endif


	cmpq $(CHAR_PER_VEC), %rdx
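	/* The flags from this compare are consumed by the `jae`/`jb`
	   below (which one is used depends on USE_EVEX_MASKED_STORE)
	   and, in the non-masked case, by the `je` at L(less_1x_vec).  */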

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   length <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */
	jae L(more_1x_vec)

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset, but that's fine since any overset bits
	   were at zero positions anyway.  */

# ifdef USE_AS_STPCPY
	tzcnt %VRCX, %VRAX
	cmpl %eax, %edx
	cmovb %edx, %eax
# ifdef USE_AS_WCSCPY
	adcl $0, %eax
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	adcq %rdi, %rax
# endif
# endif
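	/* If a zero-CHAR was found, rcx - 1 sets every bit below the
	   first match and clears the match bit itself, giving the mask
	   of CHARs to copy; if no zero was found, rcx - 1 is all-ones
	   and the whole VEC is kept.  */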
	dec %VRCX

	/* Zero out all non-zero CHARs after the first zero match.  */
	KMOV %VRCX, %k1

	/* Use VZERO as the destination so this can be reused for
	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
L(zfill_less_vec):
	/* Get mask for what we need to set.  */
	incl %edx
	mov $-1, %VRCX
	bzhi %VRDX, %VRCX, %VRCX
	KMOV %VRCX, %k1
	VMOVU_MASK %VZERO, (%rdi){%k1}
	ret

	.p2align 4,, 4
L(zero_len):
	cmpq $-1, %rdx
	jne L(best_effort_strncpy)
	movq %rdi, %rax
	ret

	.p2align 4,, 8
L(more_1x_vec):
# else
	/* `jb` because length rdx is now length - 1.  */
	jb L(less_1x_vec)
# endif


	/* This may overset but that's fine because we still need to
	   zero-fill.  */
	VMOVU %VMM(0), (%rdi)


	/* Length must be >= CHAR_PER_VEC so a match here means we must
	   zero-fill.  */
	test %VRCX, %VRCX
	jnz L(zfill)


	/* We are going to align rsi here so we will need to be able to
	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq %rsi, %rdi
	andq $-(VEC_SIZE), %rsi
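	/* rdx now holds src + (len - 1) * CHAR_SIZE - VEC_SIZE and rdi
	   holds dst - src, so adding/subtracting the aligned rsi at
	   L(loop_last_4x_vec) recovers the dst position and the
	   remaining length relative to the aligned source.  */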

L(loop_last_4x_vec):
	addq %rsi, %rdi
	subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq $2, %rdx
# endif

	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRCX

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	ja L(more_2x_vec)

L(last_2x_vec):
	/* This will need to be computed no matter what.  We do it
	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
	   the value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64
	tzcntq %rcx, %rcx
# endif

	cmpl $(CHAR_PER_VEC), %edx
	jb L(ret_vec_x1_len)

	/* Separate logic for CHAR_PER_VEC == 64 because we already did
	   `tzcnt` on VRCX.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb $CHAR_PER_VEC, %cl
	jnz L(ret_vec_x1_no_bsf)
# else
	test %VRCX, %VRCX
	jnz L(ret_vec_x1)
# endif



	VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	KMOV %k0, %VRCX

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the computed result.  */
	shlq $CHAR_PER_VEC, %rcx
# else
	tzcntq %rcx, %rcx
	addl $CHAR_PER_VEC, %ecx
# endif

	.p2align 4,, 4
L(ret_vec_x1_len):
	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it
	   has already been done.  */
# if CHAR_PER_VEC < 64
	tzcntq %rcx, %rcx
# endif
	cmpl %ecx, %edx
	jbe L(ret_vec_x1_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
	movl %ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl %ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal (VEC_SIZE)(%rdx), %eax
	adcq %rdi, %rax
# endif
# endif
	ret


	.p2align 4,, 10
L(ret_vec_x1):
	bsf %VRCX, %VRCX
L(ret_vec_x1_no_bsf):
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	subl %ecx, %edx
	cmpl $CHAR_PER_VEC, %edx
	jb L(ret_vec_x1_len_no_zfill_mov)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(last_4x_vec):
	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
	   using `movzbl`.  */
# if CHAR_PER_VEC == 64
	movzbl %dl, %edx
# else
	andl $(CHAR_PER_VEC * 4 - 1), %edx
# endif
	VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRCX
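	/* The two `subq`s of a negative constant below stand in for
	   `addq`: for VEC_SIZE == 32 the immediate -128 fits in a
	   sign-extended imm8 while +128 does not, presumably keeping
	   the encoding shorter.  */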
	subq $-(VEC_SIZE * 4), %rsi
	subq $-(VEC_SIZE * 4), %rdi
	cmpl $(CHAR_PER_VEC * 2 - 1), %edx
	jbe L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	test %VRCX, %VRCX
	/* Must fill at least 2x VEC.  */
	jnz L(zfill_vec1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	/* Must fill at least 1x VEC.  */
	jnz L(zfill_vec2)

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRCX

	/* Check if len is more than 4x VEC.  -1 because rdx is
	   len - 1.  */
	cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
	ja L(more_4x_vec)

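	/* rdx is (remaining length - 1), so a borrow here means the
	   buffer ends before the region just loaded into VMM(3);
	   handle that shorter tail at L(ret_vec_x3_len).  */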
	subl $(CHAR_PER_VEC * 3), %edx
	jb L(ret_vec_x3_len)

	test %VRCX, %VRCX
	jnz L(ret_vec_x3)

	VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	KMOV %k0, %VRCX
	tzcnt %VRCX, %VRCX
	cmpl %ecx, %edx
	jbe L(ret_vec_x4_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	movl %ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal (VEC_SIZE * 4 + 0)(%rdx), %eax
	adcq %rdi, %rax
# endif
# endif
	ret


L(ret_vec_x3_len):
	addl $(CHAR_PER_VEC * 1), %edx
	tzcnt %VRCX, %VRCX
	cmpl %ecx, %edx
	jbe L(ret_vec_x3_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
	movl %ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl %ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal (VEC_SIZE * 3 + 0)(%rdx), %eax
	adcq %rdi, %rax
# endif
# endif
	ret


	.p2align 4,, 8
L(ret_vec_x3):
	bsf %VRCX, %VRCX
	VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
	subl %ecx, %edx
	jl L(ret_vec_x3_len_no_zfill_mov)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(more_4x_vec):
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	test %VRCX, %VRCX
	jnz L(zfill_vec3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(zfill_vec4)

	/* Recheck length before aligning.  */
	cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
	jbe L(last_4x_vec)

	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq %rsi, %rdx
# endif
	subq %rsi, %rdi
	subq $-(VEC_SIZE * 5), %rsi
	andq $(VEC_SIZE * -4), %rsi


	/* Load first half of the loop before entry.  */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

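	/* VPMIN pairs the VECs so that a single VPTESTN per pair checks
	   two VECs for a zero CHAR: the unsigned minimum is zero iff
	   either input has a zero in that position.  */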
	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4


	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq $-(VEC_SIZE), %rsi
	KORTEST %k2, %k4
	jnz L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi
	cmpq %rsi, %r9
	jbe L(loop_last_4x_vec)

	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz L(loop_4x_vec)

L(loop_4x_done):
	/* Restore rdx (length).  */
	subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq $2, %rdx
# endif
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */
	addq %rsi, %rdi
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(zfill_vec1)

	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
	KMOV %k2, %VRCX
	test %VRCX, %VRCX
	jnz L(zfill_vec2)

	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(zfill_vec3)

	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
	KMOV %k4, %VRCX
	/* Fall through: zero-fill the rest.  */

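	/* The L(zfill_vec*) entries below adjust rdi/rdx according to
	   which VEC the zero-CHAR was found in and then share the
	   common zero-fill code.  */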
	.p2align 4,, 4
L(zfill_vec4):
	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -1), %rdx
L(zfill):
	/* VRCX must be non-zero.  */
	bsf %VRCX, %VRCX

	/* Adjust length / dst for zfill.  */
	subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif
L(zfill_from_page_cross):

	/* From here on out it's just memset(rdi, 0, rdx).  */
	cmpq $CHAR_PER_VEC, %rdx
	jb L(zfill_less_vec)

L(zfill_more_1x_vec):
	VMOVU %VZERO, (%rdi)
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	ja L(zfill_more_2x_vec)
L(zfill_done0):
	ret

	/* Coming from vec1/vec2 we must be able to zfill at least 2x
	   VEC.  */
	.p2align 4,, 8
L(zfill_vec3):
	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfq %rcx, %rcx
	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
	 */
	leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	subq %rcx, %rdx
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif


	VMOVU %VZERO, (%rdi)
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2), %rdx
	jb L(zfill_done0)
L(zfill_more_2x_vec):
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VZERO, (VEC_SIZE)(%rdi)
	subq $(CHAR_PER_VEC * 4 - 1), %rdx
	jbe L(zfill_done)

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rdx
# else
	addq %rdi, %rdx
# endif

	VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)


	VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq $-(VEC_SIZE * 4), %rdi
	cmpq %rdi, %rdx
	jbe L(zfill_done)

	/* Align rdi and zfill loop.  */
	andq $-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
	subq $-(VEC_SIZE * 4), %rdi
	cmpq %rdi, %rdx
	ja L(zfill_loop_4x_vec)
L(zfill_done):
	ret


	/* Less than 1x VEC case if we are not using evex masked
	   store.  */
# if !USE_EVEX_MASKED_STORE
	.p2align 4,, 8
L(copy_1x):
	/* Special case for copy 1x.  It can be handled quickly and many
	   buffer sizes have convenient alignment.  */
	VMOVU %VMM(0), (%rdi)
	/* If no zeros then we are done.  */
	testl %ecx, %ecx
	jz L(ret_1x_1x)

	/* Need to zfill.  Note we know that length <= CHAR_PER_VEC so
	   we only handle the small case here.  */
	bsf %VRCX, %VRCX
L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
	subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif

L(zfill_less_vec):
	cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb L(zfill_less_half)

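	/* Two possibly overlapping half-VEC stores, one from the start
	   and one ending at the last CHAR, cover the remaining fill
	   without branching on the exact length.  */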
	VMOVU %VZERO_HALF, (%rdi)
	VMOVU %VZERO_HALF, -((VEC_SIZE / 2) - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	ret
# ifdef USE_AS_STPCPY
L(ret_1x_1x):
	leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
	ret
# endif


# if VEC_SIZE == 64
	.p2align 4,, 4
L(copy_32_63):
	/* Overfill to avoid branches.  */
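	/* VMM(0) already holds the first 32 bytes of src; additionally
	   load the 32 bytes ending at the last CHAR of the copy so the
	   two stores cover the whole range, overlapping in the
	   middle.  */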
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

	/* We are taking advantage of the fact that to be here we must
	   be writing the null terminator at (%rdi, %rcx), so we have a
	   byte of leeway for overwriting.  */
	cmpl %ecx, %edx
	ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl %edx, %eax
	adcq %rdi, %rax
# endif
# endif
	ret
# endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl %ecx, %edx

	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
	   we have a larger copy block for 32-63 so this just falls
	   through to zfill 16-31.  If VEC_SIZE == 32 then we check for
	   the full zfill of less than 1x VEC.  */
# if VEC_SIZE == 64
	jbe L(ret_16_31)
	subl %ecx, %edx
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif
L(zfill_less_half):
L(zfill_less_32):
	cmpl $(16 / CHAR_SIZE), %edx
	jb L(zfill_less_16)
	VMOVU %VZERO_128, (%rdi)
	VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	ret
# endif
L(ret_16_31):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl %edx, %eax
	adcq %rdi, %rax
# endif
# endif
	ret
# else
	/* VEC_SIZE == 32 begins.  */
	ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
	adcq $0, %rdx
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl %edx, %eax
	adcq %rdi, %rax
# endif
# endif
	ret
# endif


	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq %VMM_128(0), (%rdi)
	movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl %ecx, %edx
	jbe L(ret_8_15)
	subl %ecx, %edx
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif
	.p2align 4,, 8
# if VEC_SIZE == 32
L(zfill_less_half):
# endif
L(zfill_less_16):
	xorl %ecx, %ecx
	cmpl $(8 / CHAR_SIZE), %edx
	jb L(zfill_less_8)
	movq %rcx, (%rdi)
	movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
	ret

	.p2align 4,, 8
L(less_1x_vec):
	je L(copy_1x)

	/* We will need the `tzcnt` result for all other copy sizes.  */
	tzcnt %VRCX, %VRCX
# if VEC_SIZE == 64
	cmpl $(32 / CHAR_SIZE), %edx
	jae L(copy_32_63)
# endif

	cmpl $(16 / CHAR_SIZE), %edx
	jae L(copy_16_31)

	cmpl $(8 / CHAR_SIZE), %edx
	jae L(copy_8_15)
# ifdef USE_AS_WCSCPY
	testl %ecx, %ecx
	jz L(zfill_less_8_set_ret)

	movl (%rsi, %rdx, CHAR_SIZE), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	cmpl %ecx, %edx
L(ret_8_15):
	adcq $0, %rdx
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif
	ret
L(zfill_less_8_set_ret):
	xorl %ecx, %ecx
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif
L(zfill_less_8):
	movl %ecx, (%rdi)
	movl %ecx, (%rdi, %rdx, CHAR_SIZE)
	ret
# else
	cmpl $3, %edx
	jb L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl -3(%rsi, %rdx), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, -3(%rdi, %rdx)
	cmpl %ecx, %edx
	jbe L(ret_4_7)
	subq %rcx, %rdx
	addq %rcx, %rdi
# ifdef USE_AS_STPCPY
	movq %rdi, %rax
# endif
	xorl %ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl $3, %edx
	jb L(zfill_less_3)
	movl %ecx, (%rdi)
	movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	ret
# endif

L(ret_4_7):
# ifdef USE_AS_STPCPY
L(ret_8_15):
	movl %edx, %eax
	adcq %rdi, %rax
# endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl %edx, %edx
	jz L(zfill_1)
	movw %cx, (%rdi)
L(zfill_1):
	movb %cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd %VMM_128(0), %r8d
	testl %edx, %edx
	jz L(copy_1)
	movw %r8w, (%rdi)
	cmpl %ecx, %edx
	ja L(zfill_from_1)
	movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
	movl %edx, %eax
	adcq %rdi, %rax
	movb %r8b, (%rdi, %rdx)
	ret
# endif

L(copy_1):
# ifdef USE_AS_STPCPY
	movl %edx, %eax
	cmpl %ecx, %edx
	adcq %rdi, %rax
# endif
# ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
# else
	movb %r8b, (%rdi, %rdx)
# endif
	ret
# endif


# ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rcx), %rax
# endif
	movw $0, -1(%rdi, %rdx)
	ret
# endif

	.p2align 4,, 4
L(zero_len):
	incq %rdx
	jne L(best_effort_strncpy)
	movq %rdi, %rax
	ret
# endif


	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
	movq %rsi, %rax
	andq $(VEC_SIZE * -1), %rax
	VPCMPEQ (%rax), %VZERO, %k0
	KMOV %k0, %VRCX
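	/* The compare mask is per-CHAR, so shift out the bits for CHARs
	   that precede src within the aligned VEC (for wide chars the
	   byte offset must first be converted to a CHAR index).  */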
# ifdef USE_AS_WCSCPY
	movl %esi, %r8d
	shrl $2, %r8d
	andl $(CHAR_PER_VEC - 1), %r8d
	shrx %VR8, %VRCX, %VRCX
# else
	shrx %VRSI, %VRCX, %VRCX
# endif

	/* Compute the number of CHARs we checked.  */
	subl %esi, %eax
	andl $(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
	shrl $2, %eax
# endif

	/* If rax > rdx then we are finishing the copy at the end of the
	   page.  */
	cmpq %rax, %rdx
	jb L(page_cross_small)


	/* If rcx is zero (no zero-CHAR found in the checked region)
	   then continue on the normal path.  */
	test %VRCX, %VRCX
	jz L(page_cross_continue)

	/* We found a zero-CHAR so we need to copy then zfill (we know
	   we didn't cover all of the length here).  */
	bsf %VRCX, %VRCX
L(movsb_and_zfill):
	incl %ecx
	subq %rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
	movq %rdi, %rax
# endif

	REP_MOVS
# ifdef USE_AS_WCSCPY
	movl $0, (%rdi)
# else
	movb $0, (%rdi)
# endif
	jmp L(zfill_from_page_cross)

L(page_cross_small):
	tzcnt %VRCX, %VRCX
	cmpl %ecx, %edx
	jbe L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq %rdi, %r9
	xorl %eax, %eax

	movl %ecx, %r8d

	subl %ecx, %edx
	leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	movl %edx, %ecx
	REP_STOS
	movq %r9, %rdi
	movl %r8d, %edx
L(page_cross_copy_only):
	leal 1(%rdx), %ecx
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcl $0, %edx
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl %edx, %eax
	adcq %rdi, %rax
# endif
# else
	movq %rdi, %rax
# endif
	REP_MOVS
	ret


L(best_effort_strncpy):
	movq %rdx, %rcx
	xorl %eax, %eax
	movq %rdi, %r8
	/* The length is >= 2^63.  We very much expect to segfault at
	   rep stos.  If that doesn't happen then just strcpy to
	   finish.  */
	REP_STOS
	movq %r8, %rdi
	jmp OVERFLOW_STRCPY
END(STRNCPY)
#endif
| 995 | |