| 1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
| 2 | Copyright (C) 2016-2022 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | /* memmove/memcpy/mempcpy is implemented as: |
| 20 | 1. Use overlapping load and store to avoid branch. |
| 21 | 2. Load all sources into registers and store them together to avoid |
| 22 | possible address overlap between source and destination. |
| 23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
| 24 | and store them together. |
| 25 | 4. If address of destination > address of source, backward copy |
| 26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
| 27 | Load the first 4 * VEC and last VEC before the loop and store |
| 28 | them after the loop to support overlapping addresses. |
| 29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
| 30 | load and aligned store. Load the last 4 * VEC and first VEC |
| 31 | before the loop and store them after the loop to support |
| 32 | overlapping addresses. |
| 33 | 6. On machines with the ERMS feature, if size is greater than or
| 34 | equal to __x86_rep_movsb_threshold and less than
| 35 | __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
| 36 | 7. If size >= __x86_shared_non_temporal_threshold and there is no |
| 37 | overlap between destination and source, use non-temporal store |
| 38 | instead of aligned store copying from either 2 or 4 pages at |
| 39 | once. |
| 40 | 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold |
| 41 | and source and destination do not page alias, copy from 2 pages |
| 42 | at once using non-temporal stores. Page aliasing in this case is |
| 43 | considered true if destination's page alignment - source's page
| 44 | alignment is less than 8 * VEC_SIZE.
| 45 | 9. If size >= 16 * __x86_shared_non_temporal_threshold, or source
| 46 | and destination do page alias, copy from 4 pages at once using
| 47 | non-temporal stores. */
| 48 | |
| 49 | #include <sysdep.h> |
| 50 | |
| 51 | #ifndef MEMCPY_SYMBOL |
| 52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
| 53 | #endif |
| 54 | |
| 55 | #ifndef MEMPCPY_SYMBOL |
| 56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
| 57 | #endif |
| 58 | |
| 59 | #ifndef MEMMOVE_CHK_SYMBOL |
| 60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
| 61 | #endif |
| 62 | |
| 63 | #ifndef XMM0 |
| 64 | # define XMM0 xmm0 |
| 65 | #endif |
| 66 | |
| 67 | #ifndef YMM0 |
| 68 | # define YMM0 ymm0 |
| 69 | #endif |
| 70 | |
| 71 | #ifndef VZEROUPPER |
| 72 | # if VEC_SIZE > 16 |
| 73 | # define VZEROUPPER vzeroupper |
| 74 | # else |
| 75 | # define VZEROUPPER |
| 76 | # endif |
| 77 | #endif |
| 78 | |
| 79 | /* Whether to align before movsb. Ultimately we want 64-byte
| 80 | alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16. */
| 81 | #define ALIGN_MOVSB (VEC_SIZE > 16) |
| 82 | /* Number of bytes to align movsb to. */ |
| 83 | #define MOVSB_ALIGN_TO 64 |
| 84 | |
| 85 | #define SMALL_MOV_SIZE (MOV_SIZE <= 4) |
| 86 | #define LARGE_MOV_SIZE (MOV_SIZE > 4) |
| 87 | |
| 88 | #if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 |
| 89 | # error MOV_SIZE Unknown |
| 90 | #endif |
| 91 | |
| 92 | #if LARGE_MOV_SIZE |
| 93 | # define SMALL_SIZE_OFFSET (4) |
| 94 | #else |
| 95 | # define SMALL_SIZE_OFFSET (0) |
| 96 | #endif |
| 97 | |
| 98 | #ifndef PAGE_SIZE |
| 99 | # define PAGE_SIZE 4096 |
| 100 | #endif |
| 101 | |
| 102 | #if PAGE_SIZE != 4096 |
| 103 | # error Unsupported PAGE_SIZE |
| 104 | #endif |
| 105 | |
| 106 | #ifndef LOG_PAGE_SIZE |
| 107 | # define LOG_PAGE_SIZE 12 |
| 108 | #endif |
| 109 | |
| 110 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
| 111 | # error Invalid LOG_PAGE_SIZE |
| 112 | #endif |
| 113 | |
| 114 | /* Bytes loaded per page in each large_memcpy inner loop iteration. */
| 115 | #if VEC_SIZE == 64 |
| 116 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
| 117 | #else |
| 118 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
| 119 | #endif |
| 120 | |
| 121 | /* Amount to shift rdx by to compare for large_memcpy_4x. */
| 122 | #ifndef LOG_4X_MEMCPY_THRESH |
| 123 | # define LOG_4X_MEMCPY_THRESH 4 |
| 124 | #endif |
| 125 | |
| 126 | /* Avoid short-distance rep movsb only with non-SSE vectors. */
| 127 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
| 128 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
| 129 | #else |
| 130 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
| 131 | #endif |
| 132 | |
| 133 | #ifndef PREFETCH |
| 134 | # define PREFETCH(addr) prefetcht0 addr |
| 135 | #endif |
| 136 | |
| 137 | /* Assume 64-byte prefetch size. */ |
| 138 | #ifndef PREFETCH_SIZE |
| 139 | # define PREFETCH_SIZE 64 |
| 140 | #endif |
| 141 | |
| 142 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
| 143 | |
| 144 | #if PREFETCH_SIZE == 64 |
| 145 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
| 146 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
| 147 | PREFETCH ((offset)base) |
| 148 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
| 149 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
| 150 | PREFETCH ((offset)base); \ |
| 151 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
| 152 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
| 153 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
| 154 | PREFETCH ((offset)base); \ |
| 155 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
| 156 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
| 157 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
| 158 | # else |
| 159 | # error Unsupported PREFETCHED_LOAD_SIZE! |
| 160 | # endif |
| 161 | #else |
| 162 | # error Unsupported PREFETCH_SIZE! |
| 163 | #endif |
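/* For example, with VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 128 ==
   2 * PREFETCH_SIZE), PREFETCH_ONE_SET (1, (%rsi), 128) issues two
   prefetcht0 instructions covering the cache lines at offsets 128 and
   192 from %rsi, i.e. one per cache line of the next 4 * VEC block. */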
| 164 | |
| 165 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
| 166 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
| 167 | VMOVU (offset)base, vec0; \ |
| 168 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
| 169 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
| 170 | VMOVNT vec0, (offset)base; \ |
| 171 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
| 172 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
| 173 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
| 174 | VMOVU (offset)base, vec0; \ |
| 175 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
| 176 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
| 177 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
| 178 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
| 179 | VMOVNT vec0, (offset)base; \ |
| 180 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
| 181 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
| 182 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
| 183 | #else |
| 184 | # error Invalid LARGE_LOAD_SIZE |
| 185 | #endif |
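/* For example, with VEC_SIZE == 32 (LARGE_LOAD_SIZE == 128),
   LOAD_ONE_SET ((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) does
   four VMOVU loads from offsets 0, 32, 64 and 96 off %rsi, and the
   matching STORE_ONE_SET writes the same four vectors with VMOVNT
   non-temporal stores. */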
| 186 | |
| 187 | #ifndef SECTION |
| 188 | # error SECTION is not defined! |
| 189 | #endif |
| 190 | |
| 191 | .section SECTION(.text),"ax",@progbits
| 192 | #if defined SHARED && IS_IN (libc) |
| 193 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
| 194 | cmp %RDX_LP, %RCX_LP |
| 195 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 196 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
| 197 | #endif |
| 198 | |
| 199 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
| 200 | mov %RDI_LP, %RAX_LP |
| 201 | add %RDX_LP, %RAX_LP |
| 202 | jmp L(start) |
| 203 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
| 204 | |
| 205 | #if defined SHARED && IS_IN (libc) |
| 206 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
| 207 | cmp %RDX_LP, %RCX_LP |
| 208 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 209 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
| 210 | #endif |
| 211 | |
| 212 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
| 213 | movq %rdi, %rax |
| 214 | L(start): |
| 215 | # ifdef __ILP32__ |
| 216 | /* Clear the upper 32 bits. */ |
| 217 | movl %edx, %edx |
| 218 | # endif |
| 219 | cmp $VEC_SIZE, %RDX_LP |
| 220 | jb L(less_vec) |
| 221 | /* Load regardless. */ |
| 222 | VMOVU (%rsi), %VEC(0) |
| 223 | cmp $(VEC_SIZE * 2), %RDX_LP |
| 224 | ja L(more_2x_vec) |
| 225 | /* From VEC_SIZE to 2 * VEC_SIZE bytes. No branch when size == VEC_SIZE. */
| 226 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
| 227 | VMOVU %VEC(0), (%rdi) |
| 228 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
| 229 | #if !(defined USE_MULTIARCH && IS_IN (libc)) |
| 230 | ZERO_UPPER_VEC_REGISTERS_RETURN |
| 231 | #else |
| 232 | VZEROUPPER_RETURN |
| 233 | #endif |
| 234 | #if defined USE_MULTIARCH && IS_IN (libc) |
| 235 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
| 236 | # if VEC_SIZE == 16 |
| 237 | ENTRY (__mempcpy_chk_erms) |
| 238 | cmp %RDX_LP, %RCX_LP |
| 239 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 240 | END (__mempcpy_chk_erms) |
| 241 | |
| 242 | /* Only used to measure performance of REP MOVSB. */ |
| 243 | ENTRY (__mempcpy_erms) |
| 244 | mov %RDI_LP, %RAX_LP |
| 245 | /* Skip zero length. */ |
| 246 | test %RDX_LP, %RDX_LP |
| 247 | jz 2f |
| 248 | add %RDX_LP, %RAX_LP |
| 249 | jmp L(start_movsb) |
| 250 | END (__mempcpy_erms) |
| 251 | |
| 252 | ENTRY (__memmove_chk_erms) |
| 253 | cmp %RDX_LP, %RCX_LP |
| 254 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 255 | END (__memmove_chk_erms) |
| 256 | |
| 257 | ENTRY (__memmove_erms) |
| 258 | movq %rdi, %rax |
| 259 | /* Skip zero length. */ |
| 260 | test %RDX_LP, %RDX_LP |
| 261 | jz 2f |
| 262 | L(start_movsb): |
| 263 | mov %RDX_LP, %RCX_LP |
| 264 | cmp %RSI_LP, %RDI_LP |
| 265 | jb 1f |
| 266 | /* Source == destination is less common. */ |
| 267 | je 2f |
| 268 | lea (%rsi,%rcx), %RDX_LP |
| 269 | cmp %RDX_LP, %RDI_LP |
| 270 | jb L(movsb_backward) |
| 271 | 1: |
| 272 | rep movsb |
| 273 | 2: |
| 274 | ret |
| 275 | L(movsb_backward): |
| 276 | leaq -1(%rdi,%rcx), %rdi |
| 277 | leaq -1(%rsi,%rcx), %rsi |
| 278 | std |
| 279 | rep movsb |
| 280 | cld |
| 281 | ret |
| 282 | END (__memmove_erms) |
| 283 | strong_alias (__memmove_erms, __memcpy_erms) |
| 284 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
| 285 | # endif |
| 286 | |
| 287 | # ifdef SHARED |
| 288 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
| 289 | cmp %RDX_LP, %RCX_LP |
| 290 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 291 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
| 292 | # endif |
| 293 | |
| 294 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
| 295 | mov %RDI_LP, %RAX_LP |
| 296 | add %RDX_LP, %RAX_LP |
| 297 | jmp L(start_erms) |
| 298 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
| 299 | |
| 300 | # ifdef SHARED |
| 301 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
| 302 | cmp %RDX_LP, %RCX_LP |
| 303 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 304 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
| 305 | # endif |
| 306 | |
| 307 | ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) |
| 308 | movq %rdi, %rax |
| 309 | L(start_erms): |
| 310 | # ifdef __ILP32__ |
| 311 | /* Clear the upper 32 bits. */ |
| 312 | movl %edx, %edx |
| 313 | # endif |
| 314 | cmp $VEC_SIZE, %RDX_LP |
| 315 | jb L(less_vec) |
| 316 | /* Load regardless. */ |
| 317 | VMOVU (%rsi), %VEC(0) |
| 318 | cmp $(VEC_SIZE * 2), %RDX_LP |
| 319 | ja L(movsb_more_2x_vec) |
| 320 | /* From VEC_SIZE to 2 * VEC_SIZE bytes. No branch when size == VEC_SIZE.
| 321 | */
| 322 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) |
| 323 | VMOVU %VEC(0), (%rdi) |
| 324 | VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) |
| 325 | L(return): |
| 326 | # if VEC_SIZE > 16 |
| 327 | ZERO_UPPER_VEC_REGISTERS_RETURN |
| 328 | # else |
| 329 | ret |
| 330 | # endif |
| 331 | #endif |
| 332 | |
| 333 | #if LARGE_MOV_SIZE |
| 334 | /* If LARGE_MOV_SIZE this fits in the aligning bytes between the |
| 335 | ENTRY block and L(less_vec). */ |
| 336 | .p2align 4,, 8 |
| 337 | L(between_4_7): |
| 338 | /* From 4 to 7. No branch when size == 4. */ |
| 339 | movl (%rsi), %ecx |
| 340 | movl (%rsi, %rdx), %esi |
| 341 | movl %ecx, (%rdi) |
| 342 | movl %esi, (%rdi, %rdx) |
| 343 | ret |
| 344 | #endif |
| 345 | |
| 346 | .p2align 4 |
| 347 | L(less_vec): |
| 348 | /* Less than 1 VEC. */ |
| 349 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
| 350 | # error Unsupported VEC_SIZE! |
| 351 | #endif |
| 352 | #if VEC_SIZE > 32 |
| 353 | cmpl $32, %edx |
| 354 | jae L(between_32_63) |
| 355 | #endif |
| 356 | #if VEC_SIZE > 16 |
| 357 | cmpl $16, %edx |
| 358 | jae L(between_16_31) |
| 359 | #endif |
| 360 | cmpl $8, %edx |
| 361 | jae L(between_8_15) |
| 362 | #if SMALL_MOV_SIZE |
| 363 | cmpl $4, %edx |
| 364 | #else |
| 365 | subq $4, %rdx |
| 366 | #endif |
| 367 | jae L(between_4_7) |
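/* Size is now at most 3 bytes (shifted by SMALL_SIZE_OFFSET when
   LARGE_MOV_SIZE). Copy the first byte and, for sizes 2 and 3, a
   2-byte word ending at the last byte; the two stores may overlap,
   which is harmless. Zero length returns via L(copy_0). */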
| 368 | cmpl $(1 - SMALL_SIZE_OFFSET), %edx |
| 369 | jl L(copy_0) |
| 370 | movb (%rsi), %cl |
| 371 | je L(copy_1) |
| 372 | movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi |
| 373 | movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) |
| 374 | L(copy_1): |
| 375 | movb %cl, (%rdi) |
| 376 | L(copy_0): |
| 377 | ret |
| 378 | |
| 379 | #if SMALL_MOV_SIZE |
| 380 | .p2align 4,, 8 |
| 381 | L(between_4_7): |
| 382 | /* From 4 to 7. No branch when size == 4. */ |
| 383 | movl -4(%rsi, %rdx), %ecx |
| 384 | movl (%rsi), %esi |
| 385 | movl %ecx, -4(%rdi, %rdx) |
| 386 | movl %esi, (%rdi) |
| 387 | ret |
| 388 | #endif |
| 389 | |
| 390 | #if VEC_SIZE > 16 |
| 391 | /* From 16 to 31. No branch when size == 16. */ |
| 392 | .p2align 4,, 8 |
| 393 | L(between_16_31): |
| 394 | vmovdqu (%rsi), %xmm0 |
| 395 | vmovdqu -16(%rsi, %rdx), %xmm1 |
| 396 | vmovdqu %xmm0, (%rdi) |
| 397 | vmovdqu %xmm1, -16(%rdi, %rdx) |
| 398 | /* No ymm registers have been touched. */ |
| 399 | ret |
| 400 | #endif |
| 401 | |
| 402 | #if VEC_SIZE > 32 |
| 403 | .p2align 4,, 10 |
| 404 | L(between_32_63): |
| 405 | /* From 32 to 63. No branch when size == 32. */ |
| 406 | VMOVU (%rsi), %YMM0 |
| 407 | VMOVU -32(%rsi, %rdx), %YMM1 |
| 408 | VMOVU %YMM0, (%rdi) |
| 409 | VMOVU %YMM1, -32(%rdi, %rdx) |
| 410 | VZEROUPPER_RETURN |
| 411 | #endif |
| 412 | |
| 413 | .p2align 4,, 10 |
| 414 | L(between_8_15): |
| 415 | /* From 8 to 15. No branch when size == 8. */ |
| 416 | movq -8(%rsi, %rdx), %rcx |
| 417 | movq (%rsi), %rsi |
| 418 | movq %rsi, (%rdi) |
| 419 | movq %rcx, -8(%rdi, %rdx) |
| 420 | ret |
| 421 | |
| 422 | .p2align 4,, 10 |
| 423 | L(last_4x_vec): |
| 424 | /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
| 425 | |
| 426 | /* VEC(0) and VEC(1) have already been loaded. */ |
| 427 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) |
| 428 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) |
| 429 | VMOVU %VEC(0), (%rdi) |
| 430 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
| 431 | VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) |
| 432 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) |
| 433 | VZEROUPPER_RETURN |
| 434 | |
| 435 | .p2align 4 |
| 436 | #if defined USE_MULTIARCH && IS_IN (libc) |
| 437 | L(movsb_more_2x_vec): |
| 438 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
| 439 | ja L(movsb) |
| 440 | #endif |
| 441 | L(more_2x_vec): |
| 442 | /* More than 2 * VEC and there may be overlap between |
| 443 | destination and source. */ |
| 444 | cmpq $(VEC_SIZE * 8), %rdx |
| 445 | ja L(more_8x_vec) |
| 446 | /* Load VEC(1) regardless. VEC(0) has already been loaded. */ |
| 447 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 448 | cmpq $(VEC_SIZE * 4), %rdx |
| 449 | jbe L(last_4x_vec) |
| 450 | /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
| 451 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| 452 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| 453 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) |
| 454 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) |
| 455 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) |
| 456 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) |
| 457 | VMOVU %VEC(0), (%rdi) |
| 458 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
| 459 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
| 460 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
| 461 | VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) |
| 462 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) |
| 463 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) |
| 464 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) |
| 465 | VZEROUPPER_RETURN |
| 466 | |
| 467 | .p2align 4,, 4 |
| 468 | L(more_8x_vec): |
| 469 | movq %rdi, %rcx |
| 470 | subq %rsi, %rcx |
| 471 | /* Go to the backward temporal copy whenever there is overlap, as
| 472 | backward REP MOVSB is slow and we don't want to use NT stores if
| 473 | there is overlap. */
| 474 | cmpq %rdx, %rcx |
| 475 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
| 476 | jb L(more_8x_vec_backward_check_nop) |
| 477 | /* Check if non-temporal move candidate. */ |
| 478 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
| 479 | /* Check non-temporal store threshold. */ |
| 480 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
| 481 | ja L(large_memcpy_2x) |
| 482 | #endif |
| 483 | /* To reach this point there cannot be overlap with dst > src. So
| 484 | check for overlap with src > dst, in which case correctness
| 485 | requires a forward copy. Otherwise decide between backward/forward
| 486 | copy depending on address aliasing. */
| 487 | |
| 488 | /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold |
| 489 | but less than __x86_shared_non_temporal_threshold. */ |
| 490 | L(more_8x_vec_check): |
| 491 | /* rcx contains dst - src. Add back length (rdx). */ |
| 492 | leaq (%rcx, %rdx), %r8 |
| 493 | /* If r8 has different sign than rcx then there is overlap so we |
| 494 | must do forward copy. */ |
| 495 | xorq %rcx, %r8 |
| 496 | /* Isolate just sign bit of r8. */ |
| 497 | shrq $63, %r8 |
| 498 | /* Get 4k difference dst - src. */ |
| 499 | andl $(PAGE_SIZE - 256), %ecx |
| 500 | /* If r8 is non-zero we must copy forward for correctness. Otherwise,
| 501 | if ecx is also zero then dst and src 4k alias, so do the backward
| 502 | copy; any non-zero result falls through to the forward copy. */
| 503 | addl %r8d, %ecx |
| 504 | jz L(more_8x_vec_backward) |
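/* Editorial sketch of the decision just made, in C; the name
   use_backward_copy and the function shape are made up for
   illustration and are not part of this implementation:

	#include <stddef.h>
	#define PAGE_SIZE 4096

	int
	use_backward_copy (unsigned char *dst, const unsigned char *src,
			   size_t len)
	{
	  ptrdiff_t diff = dst - src;
	  // A sign change between diff and diff + len means the buffers
	  // overlap with dst < src, so the forward copy is required for
	  // correctness.
	  int must_forward = ((diff + (ptrdiff_t) len) ^ diff) < 0;
	  // Non-zero page-offset bits mean no 4k aliasing; zero means
	  // dst and src 4k alias and the backward copy avoids stalls.
	  int no_4k_alias = (diff & (PAGE_SIZE - 256)) != 0;
	  return !must_forward && !no_4k_alias;
	}
   */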
| 505 | |
| 506 | /* Entered if rdx is greater than __x86_shared_non_temporal_threshold
| 507 | but there is overlap, or from the short-distance movsb check. */
| 508 | L(more_8x_vec_forward): |
| 509 | /* Load first and last 4 * VEC to support overlapping addresses. |
| 510 | */ |
| 511 | |
| 512 | /* First vec was already loaded into VEC(0). */ |
| 513 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
| 514 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
| 515 | /* Save beginning of dst. */
| 516 | movq %rdi, %rcx |
| 517 | /* Align dst to VEC_SIZE - 1. */ |
| 518 | orq $(VEC_SIZE - 1), %rdi |
| 519 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
| 520 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
| 521 | |
| 522 | /* Subtract dst from src. Add back after dst aligned. */ |
| 523 | subq %rcx, %rsi |
| 524 | /* Finish aligning dst. */ |
| 525 | incq %rdi |
| 526 | /* Restore src adjusted with new value for aligned dst. */ |
| 527 | addq %rdi, %rsi |
| 528 | /* Store end of buffer minus tail in rdx. */ |
| 529 | leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx |
| 530 | |
| 531 | /* Don't use multi-byte nop to align. */
| 532 | .p2align 4,, 11 |
| 533 | L(loop_4x_vec_forward): |
| 534 | /* Copy 4 * VEC at a time forward. */
| 535 | VMOVU (%rsi), %VEC(1) |
| 536 | VMOVU VEC_SIZE(%rsi), %VEC(2) |
| 537 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) |
| 538 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) |
| 539 | subq $-(VEC_SIZE * 4), %rsi |
| 540 | VMOVA %VEC(1), (%rdi) |
| 541 | VMOVA %VEC(2), VEC_SIZE(%rdi) |
| 542 | VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) |
| 543 | VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) |
| 544 | subq $-(VEC_SIZE * 4), %rdi |
| 545 | cmpq %rdi, %rdx |
| 546 | ja L(loop_4x_vec_forward) |
| 547 | /* Store the last 4 * VEC. */ |
| 548 | VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) |
| 549 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) |
| 550 | VMOVU %VEC(7), VEC_SIZE(%rdx) |
| 551 | VMOVU %VEC(8), (%rdx) |
| 552 | /* Store the first VEC. */ |
| 553 | VMOVU %VEC(0), (%rcx) |
| 554 | /* Keep L(nop_backward) target close to jmp for 2-byte encoding. |
| 555 | */ |
| 556 | L(nop_backward): |
| 557 | VZEROUPPER_RETURN |
| 558 | |
| 559 | .p2align 4,, 8 |
| 560 | L(more_8x_vec_backward_check_nop): |
| 561 | /* rcx contains dst - src. Test for dst == src to skip all of |
| 562 | memmove. */ |
| 563 | testq %rcx, %rcx |
| 564 | jz L(nop_backward) |
| 565 | L(more_8x_vec_backward): |
| 566 | /* Load the first 4 * VEC and last VEC to support overlapping |
| 567 | addresses. */ |
| 568 | |
| 569 | /* First vec was also loaded into VEC(0). */ |
| 570 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
| 571 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
| 572 | /* Beginning of region for 4x backward copy stored in rcx. */
| 573 | leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx |
| 574 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
| 575 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) |
| 576 | /* Subtract dst from src. Add back after dst aligned. */ |
| 577 | subq %rdi, %rsi |
| 578 | /* Align dst. */ |
| 579 | andq $-(VEC_SIZE), %rcx |
| 580 | /* Restore src. */ |
| 581 | addq %rcx, %rsi |
| 582 | |
| 583 | /* Don't use multi-byte nop to align. */ |
| 584 | .p2align 4,, 11 |
| 585 | L(loop_4x_vec_backward): |
| 586 | /* Copy 4 * VEC at a time backward. */
| 587 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) |
| 588 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| 589 | VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) |
| 590 | VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) |
| 591 | addq $(VEC_SIZE * -4), %rsi |
| 592 | VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) |
| 593 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) |
| 594 | VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) |
| 595 | VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) |
| 596 | addq $(VEC_SIZE * -4), %rcx |
| 597 | cmpq %rcx, %rdi |
| 598 | jb L(loop_4x_vec_backward) |
| 599 | /* Store the first 4 * VEC. */ |
| 600 | VMOVU %VEC(0), (%rdi) |
| 601 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
| 602 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
| 603 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
| 604 | /* Store the last VEC. */ |
| 605 | VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) |
| 606 | VZEROUPPER_RETURN |
| 607 | |
| 608 | #if defined USE_MULTIARCH && IS_IN (libc) |
| 609 | /* L(skip_short_movsb_check) is only used with ERMS. Not for |
| 610 | FSRM. */ |
| 611 | .p2align 5,, 16 |
| 612 | # if ALIGN_MOVSB |
| 613 | L(skip_short_movsb_check): |
| 614 | # if MOVSB_ALIGN_TO > VEC_SIZE |
| 615 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 616 | # endif |
| 617 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
| 618 | # error Unsupported MOVSB_ALIGN_TO |
| 619 | # endif |
| 620 | /* If the CPU does not have FSRM there are two options for aligning:
| 621 | align src if dst and src 4k alias, otherwise align dst. */
| 622 | testl $(PAGE_SIZE - 512), %ecx |
| 623 | jnz L(movsb_align_dst) |
| 624 | /* Fall through. dst and src 4k alias. It's better to align src
| 625 | here because the bottleneck will be loads due to the false
| 626 | dependency on dst. */
| 627 | |
| 628 | /* rcx already has dst - src. */ |
| 629 | movq %rcx, %r9 |
| 630 | /* Add src to len. Subtract back after src aligned. -1 because |
| 631 | src is initially aligned to MOVSB_ALIGN_TO - 1. */ |
| 632 | leaq -1(%rsi, %rdx), %rcx |
| 633 | /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ |
| 634 | orq $(MOVSB_ALIGN_TO - 1), %rsi |
| 635 | /* Restore dst and len adjusted with new values for aligned src.
| 636 | */
| 637 | leaq 1(%rsi, %r9), %rdi |
| 638 | subq %rsi, %rcx |
| 639 | /* Finish aligning src. */ |
| 640 | incq %rsi |
| 641 | |
| 642 | rep movsb |
| 643 | |
| 644 | VMOVU %VEC(0), (%r8) |
| 645 | # if MOVSB_ALIGN_TO > VEC_SIZE |
| 646 | VMOVU %VEC(1), VEC_SIZE(%r8) |
| 647 | # endif |
| 648 | VZEROUPPER_RETURN |
| 649 | # endif |
| 650 | |
| 651 | .p2align 4,, 12 |
| 652 | L(movsb): |
| 653 | movq %rdi, %rcx |
| 654 | subq %rsi, %rcx |
| 655 | /* Go to the backward temporal copy whenever there is overlap, as
| 656 | backward REP MOVSB is slow and we don't want to use NT stores if
| 657 | there is overlap. */
| 658 | cmpq %rdx, %rcx |
| 659 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
| 660 | jb L(more_8x_vec_backward_check_nop) |
| 661 | # if ALIGN_MOVSB |
| 662 | /* Save dest for storing aligning VECs later. */ |
| 663 | movq %rdi, %r8 |
| 664 | # endif |
| 665 | /* If above __x86_rep_movsb_stop_threshold it is most likely a
| 666 | candidate for NT moves as well. */
| 667 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
| 668 | jae L(large_memcpy_2x_check) |
| 669 | # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB |
| 670 | /* Only avoid short movsb if CPU has FSRM. */ |
| 671 | testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
| 672 | jz L(skip_short_movsb_check) |
| 673 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
| 674 | /* Avoid "rep movsb" if RCX, the distance between source and |
| 675 | destination, is N*4GB + [1..63] with N >= 0. */ |
| 676 | |
| 677 | /* ecx contains dst - src. The earlier check for the backward-copy
| 678 | conditions means the only remaining slow-movsb case, src = dst +
| 679 | [0, 63], shows up as ecx in [-63, 0]. Use an unsigned comparison
| 680 | with -64 to check for that case. */
| 681 | cmpl $-64, %ecx |
| 682 | ja L(more_8x_vec_forward) |
| 683 | # endif |
| 684 | # endif |
| 685 | # if ALIGN_MOVSB |
| 686 | # if MOVSB_ALIGN_TO > VEC_SIZE |
| 687 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 688 | # endif |
| 689 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
| 690 | # error Unsupported MOVSB_ALIGN_TO |
| 691 | # endif |
| 692 | /* Fall through means the CPU has FSRM. In that case exclusively
| 693 | align the destination. */
| 694 | L(movsb_align_dst): |
| 695 | /* Subtract dst from src. Add back after dst aligned. */ |
| 696 | subq %rdi, %rsi |
| 697 | /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ |
| 698 | addq $(MOVSB_ALIGN_TO - 1), %rdi |
| 699 | /* Add dst to len. Subtract back after dst aligned. */ |
| 700 | leaq (%r8, %rdx), %rcx |
| 701 | /* Finish aligning dst. */ |
| 702 | andq $-(MOVSB_ALIGN_TO), %rdi |
| 703 | /* Restore src and len adjusted with new values for aligned dst. |
| 704 | */ |
| 705 | addq %rdi, %rsi |
| 706 | subq %rdi, %rcx |
| 707 | |
| 708 | rep movsb |
| 709 | |
| 710 | /* Store VECs loaded for aligning. */ |
| 711 | VMOVU %VEC(0), (%r8) |
| 712 | # if MOVSB_ALIGN_TO > VEC_SIZE |
| 713 | VMOVU %VEC(1), VEC_SIZE(%r8) |
| 714 | # endif |
| 715 | VZEROUPPER_RETURN |
| 716 | # else /* !ALIGN_MOVSB. */ |
| 717 | L(skip_short_movsb_check): |
| 718 | mov %RDX_LP, %RCX_LP |
| 719 | rep movsb |
| 720 | ret |
| 721 | # endif |
| 722 | #endif |
| 723 | |
| 724 | .p2align 4,, 10 |
| 725 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
| 726 | L(large_memcpy_2x_check): |
| 727 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
| 728 | jb L(more_8x_vec_check) |
| 729 | L(large_memcpy_2x): |
| 730 | /* To reach this point, dst > src with overlap is impossible. What
| 731 | remains to check is src > dst with overlap. rcx already
| 732 | contains dst - src; negate rcx to get src - dst. If
| 733 | length > rcx then there is overlap and the forward copy must be used. */
| 734 | negq %rcx |
| 735 | cmpq %rcx, %rdx |
| 736 | ja L(more_8x_vec_forward) |
| 737 | |
| 738 | /* Cache align destination. First store the first 64 bytes then |
| 739 | adjust alignments. */ |
| 740 | |
| 741 | /* First vec was also loaded into VEC(0). */ |
| 742 | # if VEC_SIZE < 64 |
| 743 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 744 | # if VEC_SIZE < 32 |
| 745 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| 746 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| 747 | # endif |
| 748 | # endif |
| 749 | VMOVU %VEC(0), (%rdi) |
| 750 | # if VEC_SIZE < 64 |
| 751 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
| 752 | # if VEC_SIZE < 32 |
| 753 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
| 754 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
| 755 | # endif |
| 756 | # endif |
| 757 | |
| 758 | /* Adjust source, destination, and size. */ |
| 759 | movq %rdi, %r8 |
| 760 | andq $63, %r8 |
| 761 | /* Get the negative of offset for alignment. */ |
| 762 | subq $64, %r8 |
| 763 | /* Adjust source. */ |
| 764 | subq %r8, %rsi |
| 765 | /* Adjust destination which should be aligned now. */ |
| 766 | subq %r8, %rdi |
| 767 | /* Adjust length. */ |
| 768 | addq %r8, %rdx |
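/* Worked example: if %rdi had a 64-byte offset of 5, %r8 is
   5 - 64 = -59, so the adjustments above advance %rsi and %rdi by 59
   bytes (already covered by the 64-byte store above) and shrink %rdx
   by 59, leaving %rdi 64-byte aligned. */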
| 769 | |
| 770 | /* Test if source and destination addresses will alias. If they
| 771 | do, the larger pipeline in large_memcpy_4x alleviates the
| 772 | performance drop. */
| 773 | |
| 774 | /* ecx contains -(dst - src). not ecx will return dst - src - 1 |
| 775 | which works for testing aliasing. */ |
| 776 | notl %ecx |
| 777 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
| 778 | jz L(large_memcpy_4x) |
| 779 | |
| 780 | movq %rdx, %r10 |
| 781 | shrq $LOG_4X_MEMCPY_THRESH, %r10 |
| 782 | cmp __x86_shared_non_temporal_threshold(%rip), %r10 |
| 783 | jae L(large_memcpy_4x) |
| 784 | |
| 785 | /* edx will store remainder size for copying tail. */ |
| 786 | andl $(PAGE_SIZE * 2 - 1), %edx |
| 787 | /* r10 stores outer loop counter (size / (PAGE_SIZE * 2)). */
| 788 | shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 |
| 789 | /* Copy 4x VEC at a time from 2 pages. */ |
| 790 | .p2align 4 |
| 791 | L(loop_large_memcpy_2x_outer): |
| 792 | /* ecx stores inner loop counter. */ |
| 793 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
| 794 | L(loop_large_memcpy_2x_inner): |
| 795 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
| 796 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
| 797 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
| 798 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
| 799 | /* Load vectors from rsi. */ |
| 800 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
| 801 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
| 802 | subq $-LARGE_LOAD_SIZE, %rsi |
| 803 | /* Non-temporal store vectors to rdi. */ |
| 804 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
| 805 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
| 806 | subq $-LARGE_LOAD_SIZE, %rdi |
| 807 | decl %ecx |
| 808 | jnz L(loop_large_memcpy_2x_inner) |
| 809 | addq $PAGE_SIZE, %rdi |
| 810 | addq $PAGE_SIZE, %rsi |
| 811 | decq %r10 |
| 812 | jne L(loop_large_memcpy_2x_outer) |
| 813 | sfence |
| 814 | |
| 815 | /* Check if only last 4 loads are needed. */ |
| 816 | cmpl $(VEC_SIZE * 4), %edx |
| 817 | jbe L(large_memcpy_2x_end) |
| 818 | |
| 819 | /* Handle the remaining (less than 2 * PAGE_SIZE) bytes. */
| 820 | L(loop_large_memcpy_2x_tail): |
| 821 | /* Copy 4 * VEC at a time forward with aligned (temporal) stores. */
| 822 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
| 823 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
| 824 | VMOVU (%rsi), %VEC(0) |
| 825 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 826 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| 827 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| 828 | subq $-(VEC_SIZE * 4), %rsi |
| 829 | addl $-(VEC_SIZE * 4), %edx |
| 830 | VMOVA %VEC(0), (%rdi) |
| 831 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
| 832 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
| 833 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
| 834 | subq $-(VEC_SIZE * 4), %rdi |
| 835 | cmpl $(VEC_SIZE * 4), %edx |
| 836 | ja L(loop_large_memcpy_2x_tail) |
| 837 | |
| 838 | L(large_memcpy_2x_end): |
| 839 | /* Store the last 4 * VEC. */ |
| 840 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
| 841 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
| 842 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
| 843 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
| 844 | |
| 845 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
| 846 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
| 847 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
| 848 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
| 849 | VZEROUPPER_RETURN |
| 850 | |
| 851 | .p2align 4 |
| 852 | L(large_memcpy_4x): |
| 853 | movq %rdx, %r10 |
| 854 | /* edx will store remainder size for copying tail. */ |
| 855 | andl $(PAGE_SIZE * 4 - 1), %edx |
| 856 | /* r10 stores outer loop counter (size / (PAGE_SIZE * 4)). */
| 857 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
| 858 | /* Copy 4x VEC at a time from 4 pages. */ |
| 859 | .p2align 4 |
| 860 | L(loop_large_memcpy_4x_outer): |
| 861 | /* ecx stores inner loop counter. */ |
| 862 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
| 863 | L(loop_large_memcpy_4x_inner): |
| 864 | /* Only one prefetch set per page as doing 4 pages gives the
| 865 | prefetcher more time to keep up. */
| 866 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
| 867 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
| 868 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
| 869 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
| 870 | /* Load vectors from rsi. */ |
| 871 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
| 872 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
| 873 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
| 874 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
| 875 | subq $-LARGE_LOAD_SIZE, %rsi |
| 876 | /* Non-temporal store vectors to rdi. */ |
| 877 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
| 878 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
| 879 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
| 880 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
| 881 | subq $-LARGE_LOAD_SIZE, %rdi |
| 882 | decl %ecx |
| 883 | jnz L(loop_large_memcpy_4x_inner) |
| 884 | addq $(PAGE_SIZE * 3), %rdi |
| 885 | addq $(PAGE_SIZE * 3), %rsi |
| 886 | decq %r10 |
| 887 | jne L(loop_large_memcpy_4x_outer) |
| 888 | sfence |
| 889 | /* Check if only last 4 loads are needed. */ |
| 890 | cmpl $(VEC_SIZE * 4), %edx |
| 891 | jbe L(large_memcpy_4x_end) |
| 892 | |
| 893 | /* Handle the remaining (less than 4 * PAGE_SIZE) bytes. */
| 894 | L(loop_large_memcpy_4x_tail): |
| 895 | /* Copy 4 * VEC at a time forward with aligned (temporal) stores. */
| 896 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
| 897 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
| 898 | VMOVU (%rsi), %VEC(0) |
| 899 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
| 900 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| 901 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| 902 | subq $-(VEC_SIZE * 4), %rsi |
| 903 | addl $-(VEC_SIZE * 4), %edx |
| 904 | VMOVA %VEC(0), (%rdi) |
| 905 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
| 906 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
| 907 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
| 908 | subq $-(VEC_SIZE * 4), %rdi |
| 909 | cmpl $(VEC_SIZE * 4), %edx |
| 910 | ja L(loop_large_memcpy_4x_tail) |
| 911 | |
| 912 | L(large_memcpy_4x_end): |
| 913 | /* Store the last 4 * VEC. */ |
| 914 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
| 915 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
| 916 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
| 917 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
| 918 | |
| 919 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
| 920 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
| 921 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
| 922 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
| 923 | VZEROUPPER_RETURN |
| 924 | #endif |
| 925 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
| 926 | |
| 927 | #if IS_IN (libc) |
| 928 | # ifdef USE_MULTIARCH |
| 929 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
| 930 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
| 931 | # ifdef SHARED |
| 932 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
| 933 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
| 934 | # endif |
| 935 | # endif |
| 936 | # ifdef SHARED |
| 937 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
| 938 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
| 939 | # endif |
| 940 | #endif |
| 941 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
| 942 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
| 943 | |