/* memcpy with AVX
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"
#ifndef MEMCPY
# define MEMCPY		__memcpy_avx_unaligned
# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
#endif

	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
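/* void *__memcpy_chk (void *dst, const void *src, size_t len,
   size_t dstlen): jump to __chk_fail when len exceeds dstlen
   (%rcx), otherwise fall through into MEMCPY.  bcopy has no
   checked variant, hence the USE_AS_BCOPY guard.  */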
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

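/* void *memcpy (void *dst, const void *src, size_t len).
   On entry: %rdi = dst, %rsi = src, %rdx = len.  Returns dst in
   %rax (dst + len when built as mempcpy).  Copies of fewer than
   256 bytes are dispatched by size below; larger copies go through
   rep movsb or the 128-byte loops further down.  */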
ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$16, %dl
	jb	L(less_16bytes)
	cmp	$128, %dl
	jb	L(less_128bytes)
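/* 128..255 bytes: load the first and the last 128 bytes into all
   sixteen XMM registers before storing anything, so the two halves
   may overlap in the middle.  All short paths below use the same
   loads-then-stores scheme, which keeps them safe for overlapping
   memmove arguments with no direction check.  */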
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	0x40(%rsi), %xmm4
	vmovdqu	0x50(%rsi), %xmm5
	vmovdqu	0x60(%rsi), %xmm6
	vmovdqu	0x70(%rsi), %xmm7
	vmovdqu	-0x80(%rcx), %xmm8
	vmovdqu	-0x70(%rcx), %xmm9
	vmovdqu	-0x60(%rcx), %xmm10
	vmovdqu	-0x50(%rcx), %xmm11
	vmovdqu	-0x40(%rcx), %xmm12
	vmovdqu	-0x30(%rcx), %xmm13
	vmovdqu	-0x20(%rcx), %xmm14
	vmovdqu	-0x10(%rcx), %xmm15
	lea	(%rdi, %rdx), %rdx
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, 0x40(%rdi)
	vmovdqu	%xmm5, 0x50(%rdi)
	vmovdqu	%xmm6, 0x60(%rdi)
	vmovdqu	%xmm7, 0x70(%rdi)
	vmovdqu	%xmm8, -0x80(%rdx)
	vmovdqu	%xmm9, -0x70(%rdx)
	vmovdqu	%xmm10, -0x60(%rdx)
	vmovdqu	%xmm11, -0x50(%rdx)
	vmovdqu	%xmm12, -0x40(%rdx)
	vmovdqu	%xmm13, -0x30(%rdx)
	vmovdqu	%xmm14, -0x20(%rdx)
	vmovdqu	%xmm15, -0x10(%rdx)
	ret
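/* 64..127 bytes: four 16-byte chunks from each end, loads before
   stores as above.  */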
	.p2align 4
L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	lea	(%rdi, %rdx), %rdx
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	-0x40(%rcx), %xmm4
	vmovdqu	-0x30(%rcx), %xmm5
	vmovdqu	-0x20(%rcx), %xmm6
	vmovdqu	-0x10(%rcx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, -0x40(%rdx)
	vmovdqu	%xmm5, -0x30(%rdx)
	vmovdqu	%xmm6, -0x20(%rdx)
	vmovdqu	%xmm7, -0x10(%rdx)
	ret

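/* 32..63 bytes: two 16-byte chunks from each end.  */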
	.p2align 4
L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %xmm0
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	-0x20(%rsi, %rdx), %xmm6
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm6, -0x20(%rdi, %rdx)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

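/* 16..31 bytes: one 16-byte chunk from each end.  */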
	.p2align 4
L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

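/* 8..15 bytes: two possibly overlapping 8-byte moves.  */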
	.p2align 4
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	-0x08(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -0x08(%rdi, %rdx)
	ret

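/* 4..7 bytes: two possibly overlapping 4-byte moves.  */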
	.p2align 4
L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	-0x04(%rsi, %rdx), %ecx
	mov	(%rsi), %esi
	mov	%esi, (%rdi)
	mov	%ecx, -0x04(%rdi, %rdx)
	ret

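/* 2..3 bytes: two overlapping 2-byte moves.  The flags set by the
   cmp $1 below stay live into L(less_2bytes), where a bare jb
   skips the 1-byte move when len is 0.  */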
L(less_4bytes):
	cmp	$1, %dl
	jbe	L(less_2bytes)
	mov	-0x02(%rsi, %rdx), %cx
	mov	(%rsi), %si
	mov	%si, (%rdi)
	mov	%cx, -0x02(%rdi, %rdx)
	ret

L(less_2bytes):
	jb	L(less_0bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_0bytes):
	ret

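/* 256 bytes or more.  For memmove, (unsigned) (dst - src) < len
   means the buffers overlap with dst above src, so copy backward.
   Copies below 2048 bytes align dst to a 32-byte boundary and loop
   over 128-byte chunks with aligned YMM stores; the unaligned head
   (one YMM) and tail (eight XMM) are loaded up front and stored
   after the loop.  */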
	.p2align 4
L(256bytesormore):
#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx
	cmp	%rdx, %rcx
	jc	L(copy_backward)
#endif
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	mov	%rax, %r8
	lea	(%rsi, %rdx), %rcx
	mov	%rdi, %r10
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	mov	$0x80, %rax
	and	$-32, %rdi
	add	$32, %rdi
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	mov	%rdi, %r11
	sub	%r10, %r11
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	sub	%r11, %rdx
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	add	%r11, %rsi
	sub	%eax, %edx
L(gobble_128_loop):
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	add	%rax, %rsi
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	add	%rax, %rdi
	sub	%eax, %edx
	jae	L(gobble_128_loop)
	add	%eax, %edx
	add	%rdi, %rdx
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rdx)
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
	mov	%r8, %rax
	ret

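/* 2048 bytes or more.  The bound in %rcx is the half shared cache
   size scaled by eight; below it rep movsb does the copy, at or
   above it the non-temporal path below keeps the copy from
   evicting the whole cache.  */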
	.p2align 4
L(gobble_data_movsb):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	cmp	%rcx, %rdx
	jae	L(gobble_big_data_fwd)
	mov	%rdx, %rcx
	rep	movsb
	ret

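/* Huge forward copies: prefetch the source well ahead and store
   with non-temporal vmovntdq; the sfence after the loop orders the
   non-temporal stores before the trailing head/tail stores.  */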
	.p2align 4
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx
	vmovdqu	(%rsi), %ymm4
	vmovdqu	-0x80(%rsi, %rdx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	mov	%rdi, %r8
	and	$-32, %rdi
	add	$32, %rdi
	mov	%rdi, %r10
	sub	%r8, %r10
	sub	%r10, %rdx
	add	%r10, %rsi
	lea	(%rdi, %rdx), %rcx
	add	$-0x80, %rdx
L(gobble_mem_fwd_loop):
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	sub	$-0x80, %rsi
	vmovntdq %ymm0, (%rdi)
	vmovntdq %ymm1, 0x20(%rdi)
	vmovntdq %ymm2, 0x40(%rdi)
	vmovntdq %ymm3, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_fwd_loop)
	sfence
	vmovdqu	%ymm4, (%r8)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rcx)
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
	ret

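/* Backward copier for overlapping memmove (dst above src).  Save
   the first 128 bytes (eight XMM) and the final 32 bytes (one YMM)
   of the source, align the end of the destination down to 32
   bytes, then copy 128 bytes per iteration from high to low
   addresses: aligned YMM stores up to the cache bound in %rcx,
   non-temporal stores beyond it.  */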
#ifdef USE_AS_MEMMOVE
	.p2align 4
L(copy_backward):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	add	%rdx, %rdi
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10
	mov	%rdi, %r11
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	and	$0x1f, %r11
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	xor	%r11, %rdi
	add	%rdx, %rsi
	vmovdqu	-0x20(%rsi), %ymm4
	sub	%r11, %rsi
	sub	%r11, %rdx
	cmp	%rcx, %rdx
	ja	L(gobble_big_data_bwd)
	add	$-0x80, %rdx
L(gobble_mem_bwd_llc):
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovdqa	%ymm0, -0x20(%rdi)
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret

	.p2align 4
L(gobble_big_data_bwd):
	add	$-0x80, %rdx
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovntdq %ymm0, -0x20(%rdi)
	vmovntdq %ymm1, -0x40(%rdi)
	vmovntdq %ymm2, -0x60(%rdi)
	vmovntdq %ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_loop)
	sfence
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret
#endif
END (MEMCPY)
#endif