1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, copy backward
      4 * VEC_SIZE at a time with unaligned loads and aligned stores.
      Load the first 4 * VEC and the last VEC before the loop and
      store them after the loop to support overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      loads and aligned stores.  Load the last 4 * VEC and the first
      VEC before the loop and store them after the loop to support
      overlapping addresses.
   6. In the unaligned_erms variants, if size is above
      __x86_rep_movsb_threshold and below
      __x86_shared_non_temporal_threshold, use REP MOVSB when a
      forward copy is safe.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
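
/* For illustration only, the overlapping load/store trick of steps
   1-3 looks roughly like this in C for the 8-to-15-byte bucket
   handled by L(between_8_15) below (copy_8_to_15 is a hypothetical
   helper, not part of this file; 8 <= n <= 15):

     #include <stdint.h>
     #include <string.h>

     static void copy_8_to_15 (char *dst, const char *src, size_t n)
     {
       uint64_t head, tail;
       memcpy (&head, src, 8);          // first 8 bytes
       memcpy (&tail, src + n - 8, 8);  // last 8 bytes (overlaps the head)
       // Both loads are done before either store, so the copy is
       // correct even when dst and src overlap, with no branch on n.
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);
     }

   The other size buckets and the 2 * VEC to 8 * VEC paths below do
   the same thing with 1, 2, 4 or 8 vector registers instead of two
   integer registers.  */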
36 | |
37 | #include <sysdep.h> |
38 | |
39 | #ifndef MEMCPY_SYMBOL |
40 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
41 | #endif |
42 | |
43 | #ifndef MEMPCPY_SYMBOL |
44 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
45 | #endif |
46 | |
47 | #ifndef MEMMOVE_CHK_SYMBOL |
48 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
49 | #endif |
50 | |
51 | #ifndef VZEROUPPER |
52 | # if VEC_SIZE > 16 |
53 | # define VZEROUPPER vzeroupper |
54 | # else |
55 | # define VZEROUPPER |
56 | # endif |
57 | #endif |
58 | |
/* Avoid short-distance REP MOVSB only when using non-SSE vectors
   (VEC_SIZE > 16).  */
60 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
61 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
62 | #else |
63 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
64 | #endif |
65 | |
66 | #ifndef PREFETCH |
67 | # define PREFETCH(addr) prefetcht0 addr |
68 | #endif |
69 | |
70 | /* Assume 64-byte prefetch size. */ |
71 | #ifndef PREFETCH_SIZE |
72 | # define PREFETCH_SIZE 64 |
73 | #endif |
74 | |
75 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
76 | |
77 | #if PREFETCH_SIZE == 64 |
78 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
79 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
80 | PREFETCH ((offset)base) |
81 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
82 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
83 | PREFETCH ((offset)base); \ |
84 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
85 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
86 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
87 | PREFETCH ((offset)base); \ |
88 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
89 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
90 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
91 | # else |
92 | # error Unsupported PREFETCHED_LOAD_SIZE! |
93 | # endif |
94 | #else |
95 | # error Unsupported PREFETCH_SIZE! |
96 | #endif |
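
/* For example, with VEC_SIZE == 32 (PREFETCHED_LOAD_SIZE == 128) the
   non-temporal loops below issue, among others,
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2), which
   prefetches the cache lines at source offsets 256 and 320, i.e.
   roughly the C equivalent of

     __builtin_prefetch (src + 256, 0, 3);
     __builtin_prefetch (src + 320, 0, 3);

   two 4 * VEC_SIZE iterations ahead of the loads.  The backward loop
   passes dir == -1 so the extra prefetches step downwards instead.  */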
97 | |
98 | #ifndef SECTION |
99 | # error SECTION is not defined! |
100 | #endif |
101 | |
	.section SECTION(.text),"ax",@progbits
103 | #if defined SHARED && IS_IN (libc) |
104 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
105 | cmp %RDX_LP, %RCX_LP |
106 | jb HIDDEN_JUMPTARGET (__chk_fail) |
107 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
108 | #endif |
109 | |
110 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
111 | mov %RDI_LP, %RAX_LP |
112 | add %RDX_LP, %RAX_LP |
113 | jmp L(start) |
114 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
115 | |
116 | #if defined SHARED && IS_IN (libc) |
117 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
118 | cmp %RDX_LP, %RCX_LP |
119 | jb HIDDEN_JUMPTARGET (__chk_fail) |
120 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
121 | #endif |
122 | |
123 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
124 | movq %rdi, %rax |
125 | L(start): |
126 | # ifdef __ILP32__ |
127 | /* Clear the upper 32 bits. */ |
128 | movl %edx, %edx |
129 | # endif |
130 | cmp $VEC_SIZE, %RDX_LP |
131 | jb L(less_vec) |
132 | cmp $(VEC_SIZE * 2), %RDX_LP |
133 | ja L(more_2x_vec) |
134 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
135 | L(last_2x_vec): |
136 | #endif |
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
138 | VMOVU (%rsi), %VEC(0) |
139 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
140 | VMOVU %VEC(0), (%rdi) |
141 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
142 | VZEROUPPER |
143 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
144 | L(nop): |
145 | #endif |
146 | ret |
147 | #if defined USE_MULTIARCH && IS_IN (libc) |
148 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
149 | |
150 | # if VEC_SIZE == 16 |
151 | ENTRY (__mempcpy_chk_erms) |
152 | cmp %RDX_LP, %RCX_LP |
153 | jb HIDDEN_JUMPTARGET (__chk_fail) |
154 | END (__mempcpy_chk_erms) |
155 | |
156 | /* Only used to measure performance of REP MOVSB. */ |
157 | ENTRY (__mempcpy_erms) |
158 | mov %RDI_LP, %RAX_LP |
159 | /* Skip zero length. */ |
160 | test %RDX_LP, %RDX_LP |
161 | jz 2f |
162 | add %RDX_LP, %RAX_LP |
163 | jmp L(start_movsb) |
164 | END (__mempcpy_erms) |
165 | |
166 | ENTRY (__memmove_chk_erms) |
167 | cmp %RDX_LP, %RCX_LP |
168 | jb HIDDEN_JUMPTARGET (__chk_fail) |
169 | END (__memmove_chk_erms) |
170 | |
171 | ENTRY (__memmove_erms) |
172 | movq %rdi, %rax |
173 | /* Skip zero length. */ |
174 | test %RDX_LP, %RDX_LP |
175 | jz 2f |
176 | L(start_movsb): |
177 | mov %RDX_LP, %RCX_LP |
178 | cmp %RSI_LP, %RDI_LP |
179 | jb 1f |
180 | /* Source == destination is less common. */ |
181 | je 2f |
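	/* Here RDI > RSI.  Copy backward only if the destination
	   overlaps the source (RDI < RSI + RCX); otherwise a forward
	   REP MOVSB is safe.  */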
182 | lea (%rsi,%rcx), %RDX_LP |
183 | cmp %RDX_LP, %RDI_LP |
184 | jb L(movsb_backward) |
185 | 1: |
186 | rep movsb |
187 | 2: |
188 | ret |
189 | L(movsb_backward): |
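	/* Point RSI/RDI at the last byte of each buffer and copy
	   backward with the direction flag set.  Clear DF again
	   afterwards; it must be clear on function return.  */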
190 | leaq -1(%rdi,%rcx), %rdi |
191 | leaq -1(%rsi,%rcx), %rsi |
192 | std |
193 | rep movsb |
194 | cld |
195 | ret |
196 | END (__memmove_erms) |
197 | strong_alias (__memmove_erms, __memcpy_erms) |
198 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
199 | # endif |
200 | |
201 | # ifdef SHARED |
202 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
203 | cmp %RDX_LP, %RCX_LP |
204 | jb HIDDEN_JUMPTARGET (__chk_fail) |
205 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
206 | # endif |
207 | |
208 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
209 | mov %RDI_LP, %RAX_LP |
210 | add %RDX_LP, %RAX_LP |
211 | jmp L(start_erms) |
212 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
213 | |
214 | # ifdef SHARED |
215 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
216 | cmp %RDX_LP, %RCX_LP |
217 | jb HIDDEN_JUMPTARGET (__chk_fail) |
218 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
219 | # endif |
220 | |
221 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
222 | movq %rdi, %rax |
223 | L(start_erms): |
224 | # ifdef __ILP32__ |
225 | /* Clear the upper 32 bits. */ |
226 | movl %edx, %edx |
227 | # endif |
228 | cmp $VEC_SIZE, %RDX_LP |
229 | jb L(less_vec) |
230 | cmp $(VEC_SIZE * 2), %RDX_LP |
231 | ja L(movsb_more_2x_vec) |
232 | L(last_2x_vec): |
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
234 | VMOVU (%rsi), %VEC(0) |
235 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
236 | VMOVU %VEC(0), (%rdi) |
237 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
238 | L(return): |
239 | VZEROUPPER |
240 | ret |
241 | |
242 | L(movsb): |
243 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
244 | jae L(more_8x_vec) |
245 | cmpq %rsi, %rdi |
246 | jb 1f |
247 | /* Source == destination is less common. */ |
248 | je L(nop) |
249 | leaq (%rsi,%rdx), %r9 |
250 | cmpq %r9, %rdi |
251 | /* Avoid slow backward REP MOVSB. */ |
252 | jb L(more_8x_vec_backward) |
253 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
254 | movq %rdi, %rcx |
255 | subq %rsi, %rcx |
256 | jmp 2f |
257 | # endif |
258 | 1: |
259 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
260 | movq %rsi, %rcx |
261 | subq %rdi, %rcx |
262 | 2: |
263 | /* Avoid "rep movsb" if RCX, the distance between source and destination, |
264 | is N*4GB + [1..63] with N >= 0. */ |
265 | cmpl $63, %ecx |
266 | jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ |
267 | # endif |
268 | mov %RDX_LP, %RCX_LP |
269 | rep movsb |
270 | L(nop): |
271 | ret |
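
/* For illustration only, the conditions under which the L(movsb) code
   above actually issues REP MOVSB, written as a hypothetical C
   predicate (on entry size is already above __x86_rep_movsb_threshold):

     #include <stdbool.h>
     #include <stddef.h>
     #include <stdint.h>

     static bool
     use_rep_movsb (uintptr_t dst, uintptr_t src, size_t n,
                    size_t non_temporal_threshold)
     {
       if (n >= non_temporal_threshold)
         return false;               // take the 8x VEC path instead
       if (dst == src)
         return false;               // nothing to copy
       if (dst > src && dst < src + n)
         return false;               // backward copy needed
     #if AVOID_SHORT_DISTANCE_REP_MOVSB
       // Only the low 32 bits of the distance are checked, hence the
       // "N*4GB + [1..63]" wording above.
       if ((uint32_t) (dst > src ? dst - src : src - dst) <= 63)
         return false;               // short distance: use vector copy
     #endif
       return true;
     }
*/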
272 | #endif |
273 | |
274 | L(less_vec): |
275 | /* Less than 1 VEC. */ |
276 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
277 | # error Unsupported VEC_SIZE! |
278 | #endif |
279 | #if VEC_SIZE > 32 |
280 | cmpb $32, %dl |
281 | jae L(between_32_63) |
282 | #endif |
283 | #if VEC_SIZE > 16 |
284 | cmpb $16, %dl |
285 | jae L(between_16_31) |
286 | #endif |
287 | cmpb $8, %dl |
288 | jae L(between_8_15) |
289 | cmpb $4, %dl |
290 | jae L(between_4_7) |
291 | cmpb $1, %dl |
292 | ja L(between_2_3) |
293 | jb 1f |
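	/* Size == 1 here; size == 0 jumped straight to the return.  */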
294 | movzbl (%rsi), %ecx |
295 | movb %cl, (%rdi) |
296 | 1: |
297 | ret |
298 | #if VEC_SIZE > 32 |
299 | L(between_32_63): |
300 | /* From 32 to 63. No branch when size == 32. */ |
301 | vmovdqu (%rsi), %ymm0 |
302 | vmovdqu -32(%rsi,%rdx), %ymm1 |
303 | vmovdqu %ymm0, (%rdi) |
304 | vmovdqu %ymm1, -32(%rdi,%rdx) |
305 | VZEROUPPER |
306 | ret |
307 | #endif |
308 | #if VEC_SIZE > 16 |
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
311 | vmovdqu (%rsi), %xmm0 |
312 | vmovdqu -16(%rsi,%rdx), %xmm1 |
313 | vmovdqu %xmm0, (%rdi) |
314 | vmovdqu %xmm1, -16(%rdi,%rdx) |
315 | ret |
316 | #endif |
317 | L(between_8_15): |
318 | /* From 8 to 15. No branch when size == 8. */ |
319 | movq -8(%rsi,%rdx), %rcx |
320 | movq (%rsi), %rsi |
321 | movq %rcx, -8(%rdi,%rdx) |
322 | movq %rsi, (%rdi) |
323 | ret |
324 | L(between_4_7): |
325 | /* From 4 to 7. No branch when size == 4. */ |
326 | movl -4(%rsi,%rdx), %ecx |
327 | movl (%rsi), %esi |
328 | movl %ecx, -4(%rdi,%rdx) |
329 | movl %esi, (%rdi) |
330 | ret |
331 | L(between_2_3): |
332 | /* From 2 to 3. No branch when size == 2. */ |
333 | movzwl -2(%rsi,%rdx), %ecx |
334 | movzwl (%rsi), %esi |
335 | movw %cx, -2(%rdi,%rdx) |
336 | movw %si, (%rdi) |
337 | ret |
338 | |
339 | #if defined USE_MULTIARCH && IS_IN (libc) |
340 | L(movsb_more_2x_vec): |
341 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
342 | ja L(movsb) |
343 | #endif |
344 | L(more_2x_vec): |
345 | /* More than 2 * VEC and there may be overlap between destination |
346 | and source. */ |
347 | cmpq $(VEC_SIZE * 8), %rdx |
348 | ja L(more_8x_vec) |
349 | cmpq $(VEC_SIZE * 4), %rdx |
350 | jb L(last_4x_vec) |
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
352 | VMOVU (%rsi), %VEC(0) |
353 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
354 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
355 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
356 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) |
357 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) |
358 | VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) |
359 | VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) |
360 | VMOVU %VEC(0), (%rdi) |
361 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
362 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
363 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
364 | VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) |
365 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) |
366 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) |
367 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) |
368 | VZEROUPPER |
369 | ret |
370 | L(last_4x_vec): |
371 | /* Copy from 2 * VEC to 4 * VEC. */ |
372 | VMOVU (%rsi), %VEC(0) |
373 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
374 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) |
375 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) |
376 | VMOVU %VEC(0), (%rdi) |
377 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
378 | VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) |
379 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) |
380 | VZEROUPPER |
381 | ret |
382 | |
383 | L(more_8x_vec): |
384 | cmpq %rsi, %rdi |
385 | ja L(more_8x_vec_backward) |
386 | /* Source == destination is less common. */ |
387 | je L(nop) |
388 | /* Load the first VEC and last 4 * VEC to support overlapping |
389 | addresses. */ |
390 | VMOVU (%rsi), %VEC(4) |
391 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
392 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
393 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
394 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
395 | /* Save start and stop of the destination buffer. */ |
396 | movq %rdi, %r11 |
397 | leaq -VEC_SIZE(%rdi, %rdx), %rcx |
398 | /* Align destination for aligned stores in the loop. Compute |
399 | how much destination is misaligned. */ |
400 | movq %rdi, %r8 |
401 | andq $(VEC_SIZE - 1), %r8 |
	/* Get the negative of the number of bytes needed to reach the
	   next VEC_SIZE boundary.  */
403 | subq $VEC_SIZE, %r8 |
404 | /* Adjust source. */ |
405 | subq %r8, %rsi |
406 | /* Adjust destination which should be aligned now. */ |
407 | subq %r8, %rdi |
408 | /* Adjust length. */ |
409 | addq %r8, %rdx |
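	/* For example (hypothetical values), with VEC_SIZE == 32 and
	   RDI == 0x1005: R8 = (0x1005 & 31) - 32 = -27, so RSI/RDI
	   advance by 27 bytes, RDX shrinks by 27 and RDI == 0x1020 is
	   32-byte aligned for the loop.  The skipped bytes are covered
	   by the first VEC loaded above and stored after the loop.  */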
410 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
411 | /* Check non-temporal store threshold. */ |
412 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
413 | ja L(large_forward) |
414 | #endif |
415 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
417 | VMOVU (%rsi), %VEC(0) |
418 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
419 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
420 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
421 | addq $(VEC_SIZE * 4), %rsi |
422 | subq $(VEC_SIZE * 4), %rdx |
423 | VMOVA %VEC(0), (%rdi) |
424 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
425 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
426 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
427 | addq $(VEC_SIZE * 4), %rdi |
428 | cmpq $(VEC_SIZE * 4), %rdx |
429 | ja L(loop_4x_vec_forward) |
430 | /* Store the last 4 * VEC. */ |
431 | VMOVU %VEC(5), (%rcx) |
432 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
433 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
434 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
435 | /* Store the first VEC. */ |
436 | VMOVU %VEC(4), (%r11) |
437 | VZEROUPPER |
438 | ret |
439 | |
440 | L(more_8x_vec_backward): |
441 | /* Load the first 4 * VEC and last VEC to support overlapping |
442 | addresses. */ |
443 | VMOVU (%rsi), %VEC(4) |
444 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
445 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
446 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
447 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) |
448 | /* Save stop of the destination buffer. */ |
449 | leaq -VEC_SIZE(%rdi, %rdx), %r11 |
450 | /* Align destination end for aligned stores in the loop. Compute |
451 | how much destination end is misaligned. */ |
452 | leaq -VEC_SIZE(%rsi, %rdx), %rcx |
453 | movq %r11, %r9 |
454 | movq %r11, %r8 |
455 | andq $(VEC_SIZE - 1), %r8 |
456 | /* Adjust source. */ |
457 | subq %r8, %rcx |
458 | /* Adjust the end of destination which should be aligned now. */ |
459 | subq %r8, %r9 |
460 | /* Adjust length. */ |
461 | subq %r8, %rdx |
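	/* Mirror image of the forward setup above: R9 is the aligned
	   store position and RCX the matching unaligned load position,
	   both stepping downwards in the loop.  */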
462 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
463 | /* Check non-temporal store threshold. */ |
464 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
465 | ja L(large_backward) |
466 | #endif |
467 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
469 | VMOVU (%rcx), %VEC(0) |
470 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
471 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
472 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
473 | subq $(VEC_SIZE * 4), %rcx |
474 | subq $(VEC_SIZE * 4), %rdx |
475 | VMOVA %VEC(0), (%r9) |
476 | VMOVA %VEC(1), -VEC_SIZE(%r9) |
477 | VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) |
478 | VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) |
479 | subq $(VEC_SIZE * 4), %r9 |
480 | cmpq $(VEC_SIZE * 4), %rdx |
481 | ja L(loop_4x_vec_backward) |
482 | /* Store the first 4 * VEC. */ |
483 | VMOVU %VEC(4), (%rdi) |
484 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
485 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
486 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
487 | /* Store the last VEC. */ |
488 | VMOVU %VEC(8), (%r11) |
489 | VZEROUPPER |
490 | ret |
491 | |
492 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
493 | L(large_forward): |
494 | /* Don't use non-temporal store if there is overlap between |
495 | destination and source since destination may be in cache |
496 | when source is loaded. */ |
497 | leaq (%rdi, %rdx), %r10 |
498 | cmpq %r10, %rsi |
499 | jb L(loop_4x_vec_forward) |
500 | L(loop_large_forward): |
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
502 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
503 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) |
504 | VMOVU (%rsi), %VEC(0) |
505 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
506 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
507 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
508 | addq $PREFETCHED_LOAD_SIZE, %rsi |
509 | subq $PREFETCHED_LOAD_SIZE, %rdx |
510 | VMOVNT %VEC(0), (%rdi) |
511 | VMOVNT %VEC(1), VEC_SIZE(%rdi) |
512 | VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) |
513 | VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) |
514 | addq $PREFETCHED_LOAD_SIZE, %rdi |
515 | cmpq $PREFETCHED_LOAD_SIZE, %rdx |
516 | ja L(loop_large_forward) |
517 | sfence |
518 | /* Store the last 4 * VEC. */ |
519 | VMOVU %VEC(5), (%rcx) |
520 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
521 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
522 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
523 | /* Store the first VEC. */ |
524 | VMOVU %VEC(4), (%r11) |
525 | VZEROUPPER |
526 | ret |
527 | |
528 | L(large_backward): |
529 | /* Don't use non-temporal store if there is overlap between |
530 | destination and source since destination may be in cache |
531 | when source is loaded. */ |
532 | leaq (%rcx, %rdx), %r10 |
533 | cmpq %r10, %r9 |
534 | jb L(loop_4x_vec_backward) |
535 | L(loop_large_backward): |
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
537 | PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) |
538 | PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) |
539 | VMOVU (%rcx), %VEC(0) |
540 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
541 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
542 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
543 | subq $PREFETCHED_LOAD_SIZE, %rcx |
544 | subq $PREFETCHED_LOAD_SIZE, %rdx |
545 | VMOVNT %VEC(0), (%r9) |
546 | VMOVNT %VEC(1), -VEC_SIZE(%r9) |
547 | VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) |
548 | VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) |
549 | subq $PREFETCHED_LOAD_SIZE, %r9 |
550 | cmpq $PREFETCHED_LOAD_SIZE, %rdx |
551 | ja L(loop_large_backward) |
552 | sfence |
553 | /* Store the first 4 * VEC. */ |
554 | VMOVU %VEC(4), (%rdi) |
555 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
556 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
557 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
558 | /* Store the last VEC. */ |
559 | VMOVU %VEC(8), (%r11) |
560 | VZEROUPPER |
561 | ret |
562 | #endif |
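
/* For illustration only, the non-temporal forward loop above sketched
   with AVX intrinsics, assuming VEC_SIZE == 32 and omitting the
   software prefetches.  stream_copy_forward is a hypothetical helper,
   not part of this file:

     #include <immintrin.h>
     #include <stddef.h>

     static void stream_copy_forward (char *dst, const char *src, size_t n)
     {
       // dst is 32-byte aligned on entry and n > 4 * VEC_SIZE.
       while (n > 128)
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_stream_si256 ((__m256i *) (dst + 0), v0);  // bypasses cache
           _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
           _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
           _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
           src += 128;  dst += 128;  n -= 128;
         }
       // Order the streaming stores before the ordinary stores of the
       // saved head/tail vectors, as the sfence above does.
       _mm_sfence ();
     }

   Streaming stores are only used when source and destination do not
   overlap, since the destination may otherwise already be in cache
   when the source is loaded.  */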
563 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
564 | |
565 | #if IS_IN (libc) |
566 | # ifdef USE_MULTIARCH |
567 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
568 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
569 | # ifdef SHARED |
570 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
571 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
572 | # endif |
573 | # endif |
574 | # ifdef SHARED |
575 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
576 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
577 | # endif |
578 | #endif |
579 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
580 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
581 | |