/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
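/* The *_chk entry points verify that the copy length in %rdx does not
   exceed the destination object size in %rcx, jumping to __chk_fail
   otherwise, and then continue into the unchecked entry that follows.  */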
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)
| 37 | |
| 38 | ENTRY (__memmove_chk_avx512_no_vzeroupper) |
| 39 | cmp %RDX_LP, %RCX_LP |
| 40 | jb HIDDEN_JUMPTARGET (__chk_fail) |
| 41 | END (__memmove_chk_avx512_no_vzeroupper) |
| 42 | |
| 43 | ENTRY (__memmove_avx512_no_vzeroupper) |
| 44 | mov %RDI_LP, %RAX_LP |
| 45 | # ifdef USE_AS_MEMPCPY |
| 46 | add %RDX_LP, %RAX_LP |
| 47 | # endif |
| 48 | L(start): |
| 49 | # ifdef __ILP32__ |
| 50 | /* Clear the upper 32 bits. */ |
| 51 | mov %edx, %edx |
| 52 | # endif |
| 53 | lea (%rsi, %rdx), %rcx |
| 54 | lea (%rdi, %rdx), %r9 |
| 55 | cmp $512, %rdx |
| 56 | ja L(512bytesormore) |
| 57 | |
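/* At most 512 bytes.  Sizes of 257..512 bytes are copied right here with
   four ZMM loads/stores from each end (the two halves may overlap);
   smaller sizes branch to the dedicated blocks below.  The non-temporal
   paths also jump back here to copy their remaining tail.  */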
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

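/* More than 512 bytes: copies of at least half the shared cache size
   take the non-temporal path; 513..1024 bytes are done in a single pass
   of 512 bytes from each end; everything else falls into the
   512-bytes-per-iteration loops.  */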
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

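/* More than 1024 bytes, below the non-temporal threshold.  Copy backward
   when the destination lies above the source so overlapping regions are
   handled correctly; otherwise copy forward: preload the last 512 bytes
   into %zmm8-%zmm15 and store them once the main loop is done.  */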
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

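/* Backward variant of the loop above: preload the first 512 bytes of the
   source, copy 512 bytes per iteration from the end toward the start,
   then store the preloaded head last.  */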
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

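/* At least half the shared cache size: bypass the cache with
   non-temporal stores.  Forward direction: save the first 128 bytes,
   advance the destination to the next 128-byte boundary (the saved head
   covers the skipped bytes), stream 256 bytes per iteration with
   VMOVNTDQ, store the saved head, and finish the remaining tail via
   L(check).  */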
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

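/* Backward non-temporal variant: save the last 128 bytes, round the end
   of the destination down to a 128-byte boundary, stream 256 bytes per
   iteration toward the start, store the saved tail, and finish the
   remaining head via L(check).  */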
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif
| 421 | |