1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* memmove/memcpy/mempcpy is implemented as: |
20 | 1. Use overlapping load and store to avoid branch. |
21 | 2. Load all sources into registers and store them together to avoid |
22 | possible address overlap between source and destination. |
23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
24 | and store them together. |
25 | 4. If address of destination > address of source, backward copy |
26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
27 | Load the first 4 * VEC and last VEC before the loop and store |
28 | them after the loop to support overlapping addresses. |
29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
30 | load and aligned store. Load the last 4 * VEC and first VEC |
31 | before the loop and store them after the loop to support |
32 | overlapping addresses. |
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
36 | 7. If size >= __x86_shared_non_temporal_threshold and there is no |
37 | overlap between destination and source, use non-temporal store |
38 | instead of aligned store copying from either 2 or 4 pages at |
39 | once. |
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.

   A simplified C-level sketch of this dispatch is given in the
   comment below.  */
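
/* The following C-level sketch is illustrative only.  It mirrors the
   dispatch order of steps 1-9 above; the real code interleaves these
   checks differently and adds aliasing heuristics.  The helper
   functions, have_erms and ranges_overlap are hypothetical
   placeholders; VEC_SIZE and the __x86_* tunables are the real names
   used below.

     void *memmove_sketch (char *dst, const char *src, size_t n)
     {
       if (n <= 2 * VEC_SIZE)				// steps 1-2
	 return copy_from_each_end (dst, src, n, 1);
       if (n <= 8 * VEC_SIZE)				// step 3
	 return copy_from_each_end (dst, src, n, 4);
       if (dst > src && dst < src + n)			// step 4
	 return backward_4x_vec_loop (dst, src, n);
       if (have_erms && n >= __x86_rep_movsb_threshold
	   && n < __x86_rep_movsb_stop_threshold)	// step 6
	 return rep_movsb_copy (dst, src, n);
       if (n >= __x86_shared_non_temporal_threshold
	   && !ranges_overlap (dst, src, n))		// steps 7-9
	 return nt_copy_2_or_4_pages (dst, src, n);
       return forward_4x_vec_loop (dst, src, n);	// step 5
     }  */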
48 | |
49 | #include <sysdep.h> |
50 | |
51 | #ifndef MEMCPY_SYMBOL |
52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
53 | #endif |
54 | |
55 | #ifndef MEMPCPY_SYMBOL |
56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
57 | #endif |
58 | |
59 | #ifndef MEMMOVE_CHK_SYMBOL |
60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
61 | #endif |
62 | |
63 | #ifndef XMM0 |
64 | # define XMM0 xmm0 |
65 | #endif |
66 | |
67 | #ifndef YMM0 |
68 | # define YMM0 ymm0 |
69 | #endif |
70 | |
71 | #ifndef VZEROUPPER |
72 | # if VEC_SIZE > 16 |
73 | # define VZEROUPPER vzeroupper |
74 | # else |
75 | # define VZEROUPPER |
76 | # endif |
77 | #endif |
78 | |
/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
81 | #define ALIGN_MOVSB (VEC_SIZE > 16) |
82 | /* Number of bytes to align movsb to. */ |
83 | #define MOVSB_ALIGN_TO 64 |
84 | |
85 | #define SMALL_MOV_SIZE (MOV_SIZE <= 4) |
86 | #define LARGE_MOV_SIZE (MOV_SIZE > 4) |
87 | |
88 | #if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 |
89 | # error MOV_SIZE Unknown |
90 | #endif |
91 | |
92 | #if LARGE_MOV_SIZE |
93 | # define SMALL_SIZE_OFFSET (4) |
94 | #else |
95 | # define SMALL_SIZE_OFFSET (0) |
96 | #endif |
97 | |
98 | #ifndef PAGE_SIZE |
99 | # define PAGE_SIZE 4096 |
100 | #endif |
101 | |
102 | #if PAGE_SIZE != 4096 |
103 | # error Unsupported PAGE_SIZE |
104 | #endif |
105 | |
106 | #ifndef LOG_PAGE_SIZE |
107 | # define LOG_PAGE_SIZE 12 |
108 | #endif |
109 | |
110 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
111 | # error Invalid LOG_PAGE_SIZE |
112 | #endif |
113 | |
/* Bytes loaded from each page per iteration of the large_memcpy
   inner loops.  */
115 | #if VEC_SIZE == 64 |
116 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
117 | #else |
118 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
119 | #endif |
120 | |
/* Amount to shift rdx by to compare against the threshold for
   large_memcpy_4x.  */
122 | #ifndef LOG_4X_MEMCPY_THRESH |
123 | # define LOG_4X_MEMCPY_THRESH 4 |
124 | #endif |
125 | |
/* Avoid short distance rep movsb only with non-SSE vectors.  */
127 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
128 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
129 | #else |
130 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
131 | #endif |
132 | |
133 | #ifndef PREFETCH |
134 | # define PREFETCH(addr) prefetcht0 addr |
135 | #endif |
136 | |
137 | /* Assume 64-byte prefetch size. */ |
138 | #ifndef PREFETCH_SIZE |
139 | # define PREFETCH_SIZE 64 |
140 | #endif |
141 | |
142 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
143 | |
144 | #if PREFETCH_SIZE == 64 |
145 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
146 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
147 | PREFETCH ((offset)base) |
148 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
149 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
150 | PREFETCH ((offset)base); \ |
151 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
152 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
153 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
154 | PREFETCH ((offset)base); \ |
155 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
156 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
157 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
158 | # else |
159 | # error Unsupported PREFETCHED_LOAD_SIZE! |
160 | # endif |
161 | #else |
162 | # error Unsupported PREFETCH_SIZE! |
163 | #endif |
164 | |
165 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
166 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
167 | VMOVU (offset)base, vec0; \ |
168 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
169 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
170 | VMOVNT vec0, (offset)base; \ |
171 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
172 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
173 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
174 | VMOVU (offset)base, vec0; \ |
175 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
176 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
177 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
178 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
179 | VMOVNT vec0, (offset)base; \ |
180 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
181 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
182 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
183 | #else |
184 | # error Invalid LARGE_LOAD_SIZE |
185 | #endif |
186 | |
187 | #ifndef SECTION |
188 | # error SECTION is not defined! |
189 | #endif |
190 | |
	.section SECTION(.text),"ax",@progbits
192 | #if defined SHARED && IS_IN (libc) |
193 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
194 | cmp %RDX_LP, %RCX_LP |
195 | jb HIDDEN_JUMPTARGET (__chk_fail) |
196 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
197 | #endif |
198 | |
199 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
200 | mov %RDI_LP, %RAX_LP |
201 | add %RDX_LP, %RAX_LP |
202 | jmp L(start) |
203 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
204 | |
205 | #if defined SHARED && IS_IN (libc) |
206 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
207 | cmp %RDX_LP, %RCX_LP |
208 | jb HIDDEN_JUMPTARGET (__chk_fail) |
209 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
210 | #endif |
211 | |
212 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
213 | movq %rdi, %rax |
214 | L(start): |
215 | # ifdef __ILP32__ |
216 | /* Clear the upper 32 bits. */ |
217 | movl %edx, %edx |
218 | # endif |
219 | cmp $VEC_SIZE, %RDX_LP |
220 | jb L(less_vec) |
221 | /* Load regardless. */ |
222 | VMOVU (%rsi), %VEC(0) |
223 | cmp $(VEC_SIZE * 2), %RDX_LP |
224 | ja L(more_2x_vec) |
225 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
226 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
227 | VMOVU %VEC(0), (%rdi) |
228 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
229 | #if !(defined USE_MULTIARCH && IS_IN (libc)) |
230 | ZERO_UPPER_VEC_REGISTERS_RETURN |
231 | #else |
232 | VZEROUPPER_RETURN |
233 | #endif |
234 | #if defined USE_MULTIARCH && IS_IN (libc) |
235 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
236 | # if VEC_SIZE == 16 |
237 | ENTRY (__mempcpy_chk_erms) |
238 | cmp %RDX_LP, %RCX_LP |
239 | jb HIDDEN_JUMPTARGET (__chk_fail) |
240 | END (__mempcpy_chk_erms) |
241 | |
242 | /* Only used to measure performance of REP MOVSB. */ |
243 | ENTRY (__mempcpy_erms) |
244 | mov %RDI_LP, %RAX_LP |
245 | /* Skip zero length. */ |
246 | test %RDX_LP, %RDX_LP |
247 | jz 2f |
248 | add %RDX_LP, %RAX_LP |
249 | jmp L(start_movsb) |
250 | END (__mempcpy_erms) |
251 | |
252 | ENTRY (__memmove_chk_erms) |
253 | cmp %RDX_LP, %RCX_LP |
254 | jb HIDDEN_JUMPTARGET (__chk_fail) |
255 | END (__memmove_chk_erms) |
256 | |
257 | ENTRY (__memmove_erms) |
258 | movq %rdi, %rax |
259 | /* Skip zero length. */ |
260 | test %RDX_LP, %RDX_LP |
261 | jz 2f |
262 | L(start_movsb): |
263 | mov %RDX_LP, %RCX_LP |
264 | cmp %RSI_LP, %RDI_LP |
265 | jb 1f |
266 | /* Source == destination is less common. */ |
267 | je 2f |
268 | lea (%rsi,%rcx), %RDX_LP |
269 | cmp %RDX_LP, %RDI_LP |
270 | jb L(movsb_backward) |
271 | 1: |
272 | rep movsb |
273 | 2: |
274 | ret |
275 | L(movsb_backward): |
276 | leaq -1(%rdi,%rcx), %rdi |
277 | leaq -1(%rsi,%rcx), %rsi |
278 | std |
279 | rep movsb |
280 | cld |
281 | ret |
282 | END (__memmove_erms) |
283 | strong_alias (__memmove_erms, __memcpy_erms) |
284 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
285 | # endif |
286 | |
287 | # ifdef SHARED |
288 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
289 | cmp %RDX_LP, %RCX_LP |
290 | jb HIDDEN_JUMPTARGET (__chk_fail) |
291 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
292 | # endif |
293 | |
294 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
295 | mov %RDI_LP, %RAX_LP |
296 | add %RDX_LP, %RAX_LP |
297 | jmp L(start_erms) |
298 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
299 | |
300 | # ifdef SHARED |
301 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
302 | cmp %RDX_LP, %RCX_LP |
303 | jb HIDDEN_JUMPTARGET (__chk_fail) |
304 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
305 | # endif |
306 | |
307 | ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) |
308 | movq %rdi, %rax |
309 | L(start_erms): |
310 | # ifdef __ILP32__ |
311 | /* Clear the upper 32 bits. */ |
312 | movl %edx, %edx |
313 | # endif |
314 | cmp $VEC_SIZE, %RDX_LP |
315 | jb L(less_vec) |
316 | /* Load regardless. */ |
317 | VMOVU (%rsi), %VEC(0) |
318 | cmp $(VEC_SIZE * 2), %RDX_LP |
319 | ja L(movsb_more_2x_vec) |
320 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. |
321 | */ |
322 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) |
323 | VMOVU %VEC(0), (%rdi) |
324 | VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) |
325 | L(return): |
326 | # if VEC_SIZE > 16 |
327 | ZERO_UPPER_VEC_REGISTERS_RETURN |
328 | # else |
329 | ret |
330 | # endif |
331 | #endif |
332 | |
333 | #if LARGE_MOV_SIZE |
	/* If LARGE_MOV_SIZE, this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
336 | .p2align 4,, 8 |
337 | L(between_4_7): |
338 | /* From 4 to 7. No branch when size == 4. */ |
339 | movl (%rsi), %ecx |
340 | movl (%rsi, %rdx), %esi |
341 | movl %ecx, (%rdi) |
342 | movl %esi, (%rdi, %rdx) |
343 | ret |
344 | #endif |
345 | |
346 | .p2align 4 |
347 | L(less_vec): |
348 | /* Less than 1 VEC. */ |
349 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
350 | # error Unsupported VEC_SIZE! |
351 | #endif |
352 | #if VEC_SIZE > 32 |
353 | cmpl $32, %edx |
354 | jae L(between_32_63) |
355 | #endif |
356 | #if VEC_SIZE > 16 |
357 | cmpl $16, %edx |
358 | jae L(between_16_31) |
359 | #endif |
360 | cmpl $8, %edx |
361 | jae L(between_8_15) |
362 | #if SMALL_MOV_SIZE |
363 | cmpl $4, %edx |
364 | #else |
365 | subq $4, %rdx |
366 | #endif |
367 | jae L(between_4_7) |
368 | cmpl $(1 - SMALL_SIZE_OFFSET), %edx |
369 | jl L(copy_0) |
370 | movb (%rsi), %cl |
371 | je L(copy_1) |
372 | movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi |
373 | movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) |
374 | L(copy_1): |
375 | movb %cl, (%rdi) |
376 | L(copy_0): |
377 | ret |
378 | |
379 | #if SMALL_MOV_SIZE |
380 | .p2align 4,, 8 |
381 | L(between_4_7): |
382 | /* From 4 to 7. No branch when size == 4. */ |
383 | movl -4(%rsi, %rdx), %ecx |
384 | movl (%rsi), %esi |
385 | movl %ecx, -4(%rdi, %rdx) |
386 | movl %esi, (%rdi) |
387 | ret |
388 | #endif |
389 | |
390 | #if VEC_SIZE > 16 |
391 | /* From 16 to 31. No branch when size == 16. */ |
392 | .p2align 4,, 8 |
393 | L(between_16_31): |
394 | vmovdqu (%rsi), %xmm0 |
395 | vmovdqu -16(%rsi, %rdx), %xmm1 |
396 | vmovdqu %xmm0, (%rdi) |
397 | vmovdqu %xmm1, -16(%rdi, %rdx) |
398 | /* No ymm registers have been touched. */ |
399 | ret |
400 | #endif |
401 | |
402 | #if VEC_SIZE > 32 |
403 | .p2align 4,, 10 |
404 | L(between_32_63): |
405 | /* From 32 to 63. No branch when size == 32. */ |
406 | VMOVU (%rsi), %YMM0 |
407 | VMOVU -32(%rsi, %rdx), %YMM1 |
408 | VMOVU %YMM0, (%rdi) |
409 | VMOVU %YMM1, -32(%rdi, %rdx) |
410 | VZEROUPPER_RETURN |
411 | #endif |
412 | |
413 | .p2align 4,, 10 |
414 | L(between_8_15): |
415 | /* From 8 to 15. No branch when size == 8. */ |
416 | movq -8(%rsi, %rdx), %rcx |
417 | movq (%rsi), %rsi |
418 | movq %rsi, (%rdi) |
419 | movq %rcx, -8(%rdi, %rdx) |
420 | ret |
421 | |
422 | .p2align 4,, 10 |
423 | L(last_4x_vec): |
424 | /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
425 | |
426 | /* VEC(0) and VEC(1) have already been loaded. */ |
427 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) |
428 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) |
429 | VMOVU %VEC(0), (%rdi) |
430 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
431 | VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) |
432 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) |
433 | VZEROUPPER_RETURN |
434 | |
435 | .p2align 4 |
436 | #if defined USE_MULTIARCH && IS_IN (libc) |
437 | L(movsb_more_2x_vec): |
438 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
439 | ja L(movsb) |
440 | #endif |
441 | L(more_2x_vec): |
442 | /* More than 2 * VEC and there may be overlap between |
443 | destination and source. */ |
444 | cmpq $(VEC_SIZE * 8), %rdx |
445 | ja L(more_8x_vec) |
446 | /* Load VEC(1) regardless. VEC(0) has already been loaded. */ |
447 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
448 | cmpq $(VEC_SIZE * 4), %rdx |
449 | jbe L(last_4x_vec) |
450 | /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
451 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
452 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
453 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) |
454 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) |
455 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) |
456 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) |
457 | VMOVU %VEC(0), (%rdi) |
458 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
459 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
460 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
461 | VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) |
462 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) |
463 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) |
464 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) |
465 | VZEROUPPER_RETURN |
466 | |
467 | .p2align 4,, 4 |
468 | L(more_8x_vec): |
469 | movq %rdi, %rcx |
470 | subq %rsi, %rcx |
	/* Always go to the backward temporal copy if there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
474 | cmpq %rdx, %rcx |
475 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
476 | jb L(more_8x_vec_backward_check_nop) |
477 | /* Check if non-temporal move candidate. */ |
478 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
479 | /* Check non-temporal store threshold. */ |
480 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
481 | ja L(large_memcpy_2x) |
482 | #endif |
	/* To reach this point, overlap with dst > src has been ruled out.
	   What remains is to check for overlap with src > dst, in which
	   case correctness requires a forward copy.  Otherwise decide
	   between backward/forward copy depending on address aliasing.  */
487 | |
488 | /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold |
489 | but less than __x86_shared_non_temporal_threshold. */ |
490 | L(more_8x_vec_check): |
491 | /* rcx contains dst - src. Add back length (rdx). */ |
492 | leaq (%rcx, %rdx), %r8 |
	/* If r8 has a different sign than rcx then there is overlap, so
	   we must do a forward copy.  */
495 | xorq %rcx, %r8 |
496 | /* Isolate just sign bit of r8. */ |
497 | shrq $63, %r8 |
498 | /* Get 4k difference dst - src. */ |
499 | andl $(PAGE_SIZE - 256), %ecx |
	/* If r8 is non-zero we must copy forward for correctness.  If ecx
	   is non-zero there is no 4k false aliasing for the forward loop,
	   so also copy forward.  Only when both are zero (no overlap
	   forcing a forward copy, and dst/src 4k alias) is the backward
	   copy taken.  */
503 | addl %r8d, %ecx |
504 | jz L(more_8x_vec_backward) |
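	/* For example, dst == src + 2 * PAGE_SIZE with a length of
	   PAGE_SIZE gives r8 == 0 (no overlap forcing a forward copy) and
	   ecx == (2 * PAGE_SIZE) & (PAGE_SIZE - 256) == 0, so the
	   backward copy is taken to avoid the 4k aliasing the forward
	   loop would hit.  */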
505 | |
	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short-distance movsb check.  */
508 | L(more_8x_vec_forward): |
	/* Load the first VEC and the last 4 * VEC to support overlapping
	   addresses.  */
511 | |
512 | /* First vec was already loaded into VEC(0). */ |
513 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
514 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
	/* Save beginning of dst.  */
516 | movq %rdi, %rcx |
517 | /* Align dst to VEC_SIZE - 1. */ |
518 | orq $(VEC_SIZE - 1), %rdi |
519 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
520 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
521 | |
522 | /* Subtract dst from src. Add back after dst aligned. */ |
523 | subq %rcx, %rsi |
524 | /* Finish aligning dst. */ |
525 | incq %rdi |
526 | /* Restore src adjusted with new value for aligned dst. */ |
527 | addq %rdi, %rsi |
528 | /* Store end of buffer minus tail in rdx. */ |
529 | leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx |
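	/* At this point rcx holds the original dst, rdi is dst rounded up
	   to a VEC_SIZE boundary (advanced by at least one byte), rsi is
	   src advanced by the same amount, and rdx is
	   original dst + len - 4 * VEC_SIZE, the bound for the loop
	   below.  */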
530 | |
	/* Don't use multi-byte nop to align.  */
532 | .p2align 4,, 11 |
533 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
535 | VMOVU (%rsi), %VEC(1) |
536 | VMOVU VEC_SIZE(%rsi), %VEC(2) |
537 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) |
538 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) |
539 | subq $-(VEC_SIZE * 4), %rsi |
540 | VMOVA %VEC(1), (%rdi) |
541 | VMOVA %VEC(2), VEC_SIZE(%rdi) |
542 | VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) |
543 | VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) |
544 | subq $-(VEC_SIZE * 4), %rdi |
545 | cmpq %rdi, %rdx |
546 | ja L(loop_4x_vec_forward) |
547 | /* Store the last 4 * VEC. */ |
548 | VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) |
549 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) |
550 | VMOVU %VEC(7), VEC_SIZE(%rdx) |
551 | VMOVU %VEC(8), (%rdx) |
552 | /* Store the first VEC. */ |
553 | VMOVU %VEC(0), (%rcx) |
554 | /* Keep L(nop_backward) target close to jmp for 2-byte encoding. |
555 | */ |
556 | L(nop_backward): |
557 | VZEROUPPER_RETURN |
558 | |
559 | .p2align 4,, 8 |
560 | L(more_8x_vec_backward_check_nop): |
561 | /* rcx contains dst - src. Test for dst == src to skip all of |
562 | memmove. */ |
563 | testq %rcx, %rcx |
564 | jz L(nop_backward) |
565 | L(more_8x_vec_backward): |
566 | /* Load the first 4 * VEC and last VEC to support overlapping |
567 | addresses. */ |
568 | |
569 | /* First vec was also loaded into VEC(0). */ |
570 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
571 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
	/* Beginning of region for 4x backward copy stored in rcx.  */
573 | leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx |
574 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
575 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) |
576 | /* Subtract dst from src. Add back after dst aligned. */ |
577 | subq %rdi, %rsi |
578 | /* Align dst. */ |
579 | andq $-(VEC_SIZE), %rcx |
580 | /* Restore src. */ |
581 | addq %rcx, %rsi |
582 | |
583 | /* Don't use multi-byte nop to align. */ |
584 | .p2align 4,, 11 |
585 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
587 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) |
588 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
589 | VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) |
590 | VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) |
591 | addq $(VEC_SIZE * -4), %rsi |
592 | VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) |
593 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) |
594 | VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) |
595 | VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) |
596 | addq $(VEC_SIZE * -4), %rcx |
597 | cmpq %rcx, %rdi |
598 | jb L(loop_4x_vec_backward) |
599 | /* Store the first 4 * VEC. */ |
600 | VMOVU %VEC(0), (%rdi) |
601 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
602 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
603 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
604 | /* Store the last VEC. */ |
605 | VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) |
606 | VZEROUPPER_RETURN |
607 | |
608 | #if defined USE_MULTIARCH && IS_IN (libc) |
609 | /* L(skip_short_movsb_check) is only used with ERMS. Not for |
610 | FSRM. */ |
611 | .p2align 5,, 16 |
612 | # if ALIGN_MOVSB |
613 | L(skip_short_movsb_check): |
614 | # if MOVSB_ALIGN_TO > VEC_SIZE |
615 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
616 | # endif |
617 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
618 | # error Unsupported MOVSB_ALIGN_TO |
619 | # endif |
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
622 | testl $(PAGE_SIZE - 512), %ecx |
623 | jnz L(movsb_align_dst) |
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */
627 | |
628 | /* rcx already has dst - src. */ |
629 | movq %rcx, %r9 |
630 | /* Add src to len. Subtract back after src aligned. -1 because |
631 | src is initially aligned to MOVSB_ALIGN_TO - 1. */ |
632 | leaq -1(%rsi, %rdx), %rcx |
633 | /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ |
634 | orq $(MOVSB_ALIGN_TO - 1), %rsi |
	/* Restore dst and len adjusted with new values for the aligned
	   src.  */
637 | leaq 1(%rsi, %r9), %rdi |
638 | subq %rsi, %rcx |
639 | /* Finish aligning src. */ |
640 | incq %rsi |
641 | |
642 | rep movsb |
643 | |
644 | VMOVU %VEC(0), (%r8) |
645 | # if MOVSB_ALIGN_TO > VEC_SIZE |
646 | VMOVU %VEC(1), VEC_SIZE(%r8) |
647 | # endif |
648 | VZEROUPPER_RETURN |
649 | # endif |
650 | |
651 | .p2align 4,, 12 |
652 | L(movsb): |
653 | movq %rdi, %rcx |
654 | subq %rsi, %rcx |
	/* Always go to the backward temporal copy if there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
658 | cmpq %rdx, %rcx |
659 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
660 | jb L(more_8x_vec_backward_check_nop) |
661 | # if ALIGN_MOVSB |
662 | /* Save dest for storing aligning VECs later. */ |
663 | movq %rdi, %r8 |
664 | # endif |
	/* If above __x86_rep_movsb_stop_threshold it is most likely a
	   candidate for NT moves as well.  */
667 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
668 | jae L(large_memcpy_2x_check) |
669 | # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB |
670 | /* Only avoid short movsb if CPU has FSRM. */ |
671 | testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
672 | jz L(skip_short_movsb_check) |
673 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
674 | /* Avoid "rep movsb" if RCX, the distance between source and |
675 | destination, is N*4GB + [1..63] with N >= 0. */ |
676 | |
	/* ecx contains dst - src.  The earlier backward-copy checks mean
	   the only remaining slow-movsb case, src = dst + [0, 63],
	   corresponds to ecx in [-63, 0].  Use an unsigned comparison
	   against -64 to catch that case.  */
681 | cmpl $-64, %ecx |
682 | ja L(more_8x_vec_forward) |
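	/* For example, src == dst + 32 gives ecx == -32 == 0xffffffe0,
	   which is above -64 as unsigned, so the vector forward loop is
	   used instead of the slow short-distance rep movsb.  With
	   src == dst + 100, ecx == -100 is below -64 as unsigned and rep
	   movsb is still used.  */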
683 | # endif |
684 | # endif |
685 | # if ALIGN_MOVSB |
686 | # if MOVSB_ALIGN_TO > VEC_SIZE |
687 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
688 | # endif |
689 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
690 | # error Unsupported MOVSB_ALIGN_TO |
691 | # endif |
	/* Falling through means the CPU has FSRM.  In that case
	   exclusively align the destination.  */
694 | L(movsb_align_dst): |
695 | /* Subtract dst from src. Add back after dst aligned. */ |
696 | subq %rdi, %rsi |
697 | /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ |
698 | addq $(MOVSB_ALIGN_TO - 1), %rdi |
699 | /* Add dst to len. Subtract back after dst aligned. */ |
700 | leaq (%r8, %rdx), %rcx |
701 | /* Finish aligning dst. */ |
702 | andq $-(MOVSB_ALIGN_TO), %rdi |
703 | /* Restore src and len adjusted with new values for aligned dst. |
704 | */ |
705 | addq %rdi, %rsi |
706 | subq %rdi, %rcx |
707 | |
708 | rep movsb |
709 | |
710 | /* Store VECs loaded for aligning. */ |
711 | VMOVU %VEC(0), (%r8) |
712 | # if MOVSB_ALIGN_TO > VEC_SIZE |
713 | VMOVU %VEC(1), VEC_SIZE(%r8) |
714 | # endif |
715 | VZEROUPPER_RETURN |
716 | # else /* !ALIGN_MOVSB. */ |
717 | L(skip_short_movsb_check): |
718 | mov %RDX_LP, %RCX_LP |
719 | rep movsb |
720 | ret |
721 | # endif |
722 | #endif |
723 | |
724 | .p2align 4,, 10 |
725 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
726 | L(large_memcpy_2x_check): |
727 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
728 | jb L(more_8x_vec_check) |
729 | L(large_memcpy_2x): |
	/* To reach this point, overlap with dst > src is impossible.
	   What remains to check is overlap with src > dst.  rcx already
	   contains dst - src.  Negate rcx to get src - dst.  If length >
	   rcx then there is overlap and the forward copy is required for
	   correctness.  */
734 | negq %rcx |
735 | cmpq %rcx, %rdx |
736 | ja L(more_8x_vec_forward) |
737 | |
738 | /* Cache align destination. First store the first 64 bytes then |
739 | adjust alignments. */ |
740 | |
741 | /* First vec was also loaded into VEC(0). */ |
742 | # if VEC_SIZE < 64 |
743 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
744 | # if VEC_SIZE < 32 |
745 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
746 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
747 | # endif |
748 | # endif |
749 | VMOVU %VEC(0), (%rdi) |
750 | # if VEC_SIZE < 64 |
751 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
752 | # if VEC_SIZE < 32 |
753 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
754 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
755 | # endif |
756 | # endif |
757 | |
758 | /* Adjust source, destination, and size. */ |
759 | movq %rdi, %r8 |
760 | andq $63, %r8 |
761 | /* Get the negative of offset for alignment. */ |
762 | subq $64, %r8 |
763 | /* Adjust source. */ |
764 | subq %r8, %rsi |
765 | /* Adjust destination which should be aligned now. */ |
766 | subq %r8, %rdi |
767 | /* Adjust length. */ |
768 | addq %r8, %rdx |
769 | |
	/* Test if the source and destination addresses will alias.  If
	   they do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */
773 | |
	/* ecx contains -(dst - src).  notl %ecx yields dst - src - 1,
	   which works for testing aliasing.  */
776 | notl %ecx |
777 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
778 | jz L(large_memcpy_4x) |
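	/* For example, with VEC_SIZE == 32 the mask is 0xf00:
	   dst == src + PAGE_SIZE + 128 gives (dst - src - 1) & 0xf00 == 0
	   and goes to large_memcpy_4x, while dst == src + PAGE_SIZE / 2
	   leaves non-zero masked bits and stays on the 2x path.  */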
779 | |
780 | movq %rdx, %r10 |
781 | shrq $LOG_4X_MEMCPY_THRESH, %r10 |
782 | cmp __x86_shared_non_temporal_threshold(%rip), %r10 |
783 | jae L(large_memcpy_4x) |
784 | |
785 | /* edx will store remainder size for copying tail. */ |
786 | andl $(PAGE_SIZE * 2 - 1), %edx |
787 | /* r10 stores outer loop counter. */ |
788 | shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 |
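	/* The two shifts above combine to r10 = len / (2 * PAGE_SIZE),
	   the number of 2-page blocks copied by the outer loop.  */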
789 | /* Copy 4x VEC at a time from 2 pages. */ |
790 | .p2align 4 |
791 | L(loop_large_memcpy_2x_outer): |
792 | /* ecx stores inner loop counter. */ |
793 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
794 | L(loop_large_memcpy_2x_inner): |
795 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
796 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
797 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
798 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
799 | /* Load vectors from rsi. */ |
800 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
801 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
802 | subq $-LARGE_LOAD_SIZE, %rsi |
803 | /* Non-temporal store vectors to rdi. */ |
804 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
805 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
806 | subq $-LARGE_LOAD_SIZE, %rdi |
807 | decl %ecx |
808 | jnz L(loop_large_memcpy_2x_inner) |
809 | addq $PAGE_SIZE, %rdi |
810 | addq $PAGE_SIZE, %rsi |
811 | decq %r10 |
812 | jne L(loop_large_memcpy_2x_outer) |
813 | sfence |
814 | |
815 | /* Check if only last 4 loads are needed. */ |
816 | cmpl $(VEC_SIZE * 4), %edx |
817 | jbe L(large_memcpy_2x_end) |
818 | |
819 | /* Handle the last 2 * PAGE_SIZE bytes. */ |
820 | L(loop_large_memcpy_2x_tail): |
	/* Copy 4 * VEC at a time forward with aligned stores.  */
822 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
823 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
824 | VMOVU (%rsi), %VEC(0) |
825 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
826 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
827 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
828 | subq $-(VEC_SIZE * 4), %rsi |
829 | addl $-(VEC_SIZE * 4), %edx |
830 | VMOVA %VEC(0), (%rdi) |
831 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
832 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
833 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
834 | subq $-(VEC_SIZE * 4), %rdi |
835 | cmpl $(VEC_SIZE * 4), %edx |
836 | ja L(loop_large_memcpy_2x_tail) |
837 | |
838 | L(large_memcpy_2x_end): |
839 | /* Store the last 4 * VEC. */ |
840 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
841 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
842 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
843 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
844 | |
845 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
846 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
847 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
848 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
849 | VZEROUPPER_RETURN |
850 | |
851 | .p2align 4 |
852 | L(large_memcpy_4x): |
853 | movq %rdx, %r10 |
854 | /* edx will store remainder size for copying tail. */ |
855 | andl $(PAGE_SIZE * 4 - 1), %edx |
856 | /* r10 stores outer loop counter. */ |
857 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
858 | /* Copy 4x VEC at a time from 4 pages. */ |
859 | .p2align 4 |
860 | L(loop_large_memcpy_4x_outer): |
861 | /* ecx stores inner loop counter. */ |
862 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
863 | L(loop_large_memcpy_4x_inner): |
	/* Only one prefetch set per page as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
866 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
867 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
868 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
869 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
870 | /* Load vectors from rsi. */ |
871 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
872 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
873 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
874 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
875 | subq $-LARGE_LOAD_SIZE, %rsi |
876 | /* Non-temporal store vectors to rdi. */ |
877 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
878 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
879 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
880 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
881 | subq $-LARGE_LOAD_SIZE, %rdi |
882 | decl %ecx |
883 | jnz L(loop_large_memcpy_4x_inner) |
884 | addq $(PAGE_SIZE * 3), %rdi |
885 | addq $(PAGE_SIZE * 3), %rsi |
886 | decq %r10 |
887 | jne L(loop_large_memcpy_4x_outer) |
888 | sfence |
889 | /* Check if only last 4 loads are needed. */ |
890 | cmpl $(VEC_SIZE * 4), %edx |
891 | jbe L(large_memcpy_4x_end) |
892 | |
893 | /* Handle the last 4 * PAGE_SIZE bytes. */ |
894 | L(loop_large_memcpy_4x_tail): |
	/* Copy 4 * VEC at a time forward with aligned stores.  */
896 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
897 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
898 | VMOVU (%rsi), %VEC(0) |
899 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
900 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
901 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
902 | subq $-(VEC_SIZE * 4), %rsi |
903 | addl $-(VEC_SIZE * 4), %edx |
904 | VMOVA %VEC(0), (%rdi) |
905 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
906 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
907 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
908 | subq $-(VEC_SIZE * 4), %rdi |
909 | cmpl $(VEC_SIZE * 4), %edx |
910 | ja L(loop_large_memcpy_4x_tail) |
911 | |
912 | L(large_memcpy_4x_end): |
913 | /* Store the last 4 * VEC. */ |
914 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
915 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
916 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
917 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
918 | |
919 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
920 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
921 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
922 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
923 | VZEROUPPER_RETURN |
924 | #endif |
925 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
926 | |
927 | #if IS_IN (libc) |
928 | # ifdef USE_MULTIARCH |
929 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
930 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
931 | # ifdef SHARED |
932 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
933 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
934 | # endif |
935 | # endif |
936 | # ifdef SHARED |
937 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
938 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
939 | # endif |
940 | #endif |
941 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
942 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
943 | |