1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* memmove/memcpy/mempcpy is implemented as: |
20 | 1. Use overlapping load and store to avoid branch. |
21 | 2. Load all sources into registers and store them together to avoid |
22 | possible address overlap between source and destination. |
23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
24 | and store them together. |
25 | 4. If address of destination > address of source, backward copy |
26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
27 | Load the first 4 * VEC and last VEC before the loop and store |
28 | them after the loop to support overlapping addresses. |
29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
30 | load and aligned store. Load the last 4 * VEC and first VEC |
31 | before the loop and store them after the loop to support |
32 | overlapping addresses. |
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
36 | 7. If size >= __x86_shared_non_temporal_threshold and there is no |
37 | overlap between destination and source, use non-temporal store |
38 | instead of aligned store copying from either 2 or 4 pages at |
39 | once. |
40 | 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold |
41 | and source and destination do not page alias, copy from 2 pages |
42 | at once using non-temporal stores. Page aliasing in this case is |
      considered true if destination's page alignment - source's page
44 | alignment is less than 8 * VEC_SIZE. |
45 | 9. If size >= 16 * __x86_shared_non_temporal_threshold or source |
46 | and destination do page alias copy from 4 pages at once using |
47 | non-temporal stores. */ |
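
/* For orientation, a rough C sketch of the size-based dispatch described
   above.  This is illustrative only: the parameter names stand in for the
   __x86_* tunables (movsb_lo/movsb_hi for __x86_rep_movsb_threshold and
   __x86_rep_movsb_stop_threshold, nt_threshold for
   __x86_shared_non_temporal_threshold), and the real code below also folds
   in the exact overlap tests, alignment handling and the ERMS/non-ERMS
   entry points.

   #include <stddef.h>

   enum copy_path
   {
     COPY_LESS_VEC,     // size < VEC_SIZE: small-size ladder
     COPY_2X_VEC,       // up to 2 * VEC_SIZE: two overlapping VEC moves
     COPY_8X_VEC,       // up to 8 * VEC_SIZE: load everything, then store
     COPY_REP_MOVSB,    // ERMS window: rep movsb
     COPY_4X_LOOP,      // forward/backward 4 * VEC loop, aligned stores
     COPY_NT_2_PAGES,   // non-temporal stores, 2 pages per iteration
     COPY_NT_4_PAGES    // non-temporal stores, 4 pages per iteration
   };

   static enum copy_path
   classify (size_t size, size_t vec_size, size_t movsb_lo, size_t movsb_hi,
             size_t nt_threshold, int overlap, int page_alias)
   {
     if (size < vec_size)
       return COPY_LESS_VEC;
     if (size <= 2 * vec_size)
       return COPY_2X_VEC;
     if (size <= 8 * vec_size)
       return COPY_8X_VEC;
     if (size >= movsb_lo && size < movsb_hi)
       return COPY_REP_MOVSB;
     if (size >= nt_threshold && !overlap)
       return (size >= 16 * nt_threshold || page_alias)
              ? COPY_NT_4_PAGES : COPY_NT_2_PAGES;
     return COPY_4X_LOOP;
   }  */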
48 | |
49 | #include <sysdep.h> |
50 | |
51 | #ifndef MEMCPY_SYMBOL |
52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
53 | #endif |
54 | |
55 | #ifndef MEMPCPY_SYMBOL |
56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
57 | #endif |
58 | |
59 | #ifndef MEMMOVE_CHK_SYMBOL |
60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
61 | #endif |
62 | |
63 | #ifndef XMM0 |
64 | # define XMM0 xmm0 |
65 | #endif |
66 | |
67 | #ifndef YMM0 |
68 | # define YMM0 ymm0 |
69 | #endif |
70 | |
71 | #ifndef VZEROUPPER |
72 | # if VEC_SIZE > 16 |
73 | # define VZEROUPPER vzeroupper |
74 | # else |
75 | # define VZEROUPPER |
76 | # endif |
77 | #endif |
78 | |
79 | #ifndef PAGE_SIZE |
80 | # define PAGE_SIZE 4096 |
81 | #endif |
82 | |
83 | #if PAGE_SIZE != 4096 |
84 | # error Unsupported PAGE_SIZE |
85 | #endif |
86 | |
87 | #ifndef LOG_PAGE_SIZE |
88 | # define LOG_PAGE_SIZE 12 |
89 | #endif |
90 | |
91 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
92 | # error Invalid LOG_PAGE_SIZE |
93 | #endif |
94 | |
/* Bytes loaded from each page per iteration of the large_memcpy
   inner loop.  */
96 | #if VEC_SIZE == 64 |
97 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
98 | #else |
99 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
100 | #endif |
101 | |
/* Amount to shift rdx by when comparing against the non-temporal
   threshold to select the large_memcpy_4x path.  */
103 | #ifndef LOG_4X_MEMCPY_THRESH |
104 | # define LOG_4X_MEMCPY_THRESH 4 |
105 | #endif |
106 | |
/* Avoid short-distance REP MOVSB only with non-SSE vectors
   (VEC_SIZE > 16).  */
108 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
109 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
110 | #else |
111 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
112 | #endif |
113 | |
114 | #ifndef PREFETCH |
115 | # define PREFETCH(addr) prefetcht0 addr |
116 | #endif |
117 | |
118 | /* Assume 64-byte prefetch size. */ |
119 | #ifndef PREFETCH_SIZE |
120 | # define PREFETCH_SIZE 64 |
121 | #endif |
122 | |
123 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
124 | |
125 | #if PREFETCH_SIZE == 64 |
126 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
127 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
128 | PREFETCH ((offset)base) |
129 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
130 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
131 | PREFETCH ((offset)base); \ |
132 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
133 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
134 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
135 | PREFETCH ((offset)base); \ |
136 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
137 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
138 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
139 | # else |
140 | # error Unsupported PREFETCHED_LOAD_SIZE! |
141 | # endif |
142 | #else |
143 | # error Unsupported PREFETCH_SIZE! |
144 | #endif |
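
/* For example, assuming VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 128
   == 2 * PREFETCH_SIZE), PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
   evaluates to
	prefetcht0 128(%rsi); prefetcht0 192(%rsi)
   i.e. one prefetch per 64-byte line of the next 4 * VEC block.  */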
145 | |
146 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
147 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
148 | VMOVU (offset)base, vec0; \ |
149 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
150 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
151 | VMOVNT vec0, (offset)base; \ |
152 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
153 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
154 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
155 | VMOVU (offset)base, vec0; \ |
156 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
157 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
158 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
159 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
160 | VMOVNT vec0, (offset)base; \ |
161 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
162 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
163 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
164 | #else |
165 | # error Invalid LARGE_LOAD_SIZE |
166 | #endif |
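
/* For example, with LARGE_LOAD_SIZE == (VEC_SIZE * 4),
   LOAD_ONE_SET ((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) performs
   four unaligned VMOVU loads from offsets 0, VEC_SIZE, VEC_SIZE * 2 and
   VEC_SIZE * 3 past %rsi, and the matching STORE_ONE_SET emits the
   corresponding non-temporal VMOVNT stores to the destination.  */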
167 | |
168 | #ifndef SECTION |
169 | # error SECTION is not defined! |
170 | #endif |
171 | |
	.section SECTION(.text),"ax",@progbits
173 | #if defined SHARED && IS_IN (libc) |
174 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
175 | cmp %RDX_LP, %RCX_LP |
176 | jb HIDDEN_JUMPTARGET (__chk_fail) |
177 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
178 | #endif |
179 | |
180 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
181 | mov %RDI_LP, %RAX_LP |
182 | add %RDX_LP, %RAX_LP |
183 | jmp L(start) |
184 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
185 | |
186 | #if defined SHARED && IS_IN (libc) |
187 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
188 | cmp %RDX_LP, %RCX_LP |
189 | jb HIDDEN_JUMPTARGET (__chk_fail) |
190 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
191 | #endif |
192 | |
193 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
194 | movq %rdi, %rax |
195 | L(start): |
196 | # ifdef __ILP32__ |
197 | /* Clear the upper 32 bits. */ |
198 | movl %edx, %edx |
199 | # endif |
200 | cmp $VEC_SIZE, %RDX_LP |
201 | jb L(less_vec) |
202 | cmp $(VEC_SIZE * 2), %RDX_LP |
203 | ja L(more_2x_vec) |
204 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
205 | L(last_2x_vec): |
206 | #endif |
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
208 | VMOVU (%rsi), %VEC(0) |
209 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
210 | VMOVU %VEC(0), (%rdi) |
211 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
212 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
213 | L(nop): |
214 | ret |
215 | #else |
216 | VZEROUPPER_RETURN |
217 | #endif |
218 | #if defined USE_MULTIARCH && IS_IN (libc) |
219 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
220 | |
221 | # if VEC_SIZE == 16 |
222 | ENTRY (__mempcpy_chk_erms) |
223 | cmp %RDX_LP, %RCX_LP |
224 | jb HIDDEN_JUMPTARGET (__chk_fail) |
225 | END (__mempcpy_chk_erms) |
226 | |
227 | /* Only used to measure performance of REP MOVSB. */ |
228 | ENTRY (__mempcpy_erms) |
229 | mov %RDI_LP, %RAX_LP |
230 | /* Skip zero length. */ |
231 | test %RDX_LP, %RDX_LP |
232 | jz 2f |
233 | add %RDX_LP, %RAX_LP |
234 | jmp L(start_movsb) |
235 | END (__mempcpy_erms) |
236 | |
237 | ENTRY (__memmove_chk_erms) |
238 | cmp %RDX_LP, %RCX_LP |
239 | jb HIDDEN_JUMPTARGET (__chk_fail) |
240 | END (__memmove_chk_erms) |
241 | |
242 | ENTRY (__memmove_erms) |
243 | movq %rdi, %rax |
244 | /* Skip zero length. */ |
245 | test %RDX_LP, %RDX_LP |
246 | jz 2f |
247 | L(start_movsb): |
248 | mov %RDX_LP, %RCX_LP |
249 | cmp %RSI_LP, %RDI_LP |
250 | jb 1f |
251 | /* Source == destination is less common. */ |
252 | je 2f |
253 | lea (%rsi,%rcx), %RDX_LP |
254 | cmp %RDX_LP, %RDI_LP |
255 | jb L(movsb_backward) |
256 | 1: |
257 | rep movsb |
258 | 2: |
259 | ret |
260 | L(movsb_backward): |
261 | leaq -1(%rdi,%rcx), %rdi |
262 | leaq -1(%rsi,%rcx), %rsi |
263 | std |
264 | rep movsb |
265 | cld |
266 | ret |
267 | END (__memmove_erms) |
268 | strong_alias (__memmove_erms, __memcpy_erms) |
269 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
270 | # endif |
271 | |
272 | # ifdef SHARED |
273 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
274 | cmp %RDX_LP, %RCX_LP |
275 | jb HIDDEN_JUMPTARGET (__chk_fail) |
276 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
277 | # endif |
278 | |
279 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
280 | mov %RDI_LP, %RAX_LP |
281 | add %RDX_LP, %RAX_LP |
282 | jmp L(start_erms) |
283 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
284 | |
285 | # ifdef SHARED |
286 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
287 | cmp %RDX_LP, %RCX_LP |
288 | jb HIDDEN_JUMPTARGET (__chk_fail) |
289 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
290 | # endif |
291 | |
292 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
293 | movq %rdi, %rax |
294 | L(start_erms): |
295 | # ifdef __ILP32__ |
296 | /* Clear the upper 32 bits. */ |
297 | movl %edx, %edx |
298 | # endif |
299 | cmp $VEC_SIZE, %RDX_LP |
300 | jb L(less_vec) |
301 | cmp $(VEC_SIZE * 2), %RDX_LP |
302 | ja L(movsb_more_2x_vec) |
303 | L(last_2x_vec): |
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
305 | VMOVU (%rsi), %VEC(0) |
306 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
307 | VMOVU %VEC(0), (%rdi) |
308 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
309 | L(return): |
310 | #if VEC_SIZE > 16 |
311 | ZERO_UPPER_VEC_REGISTERS_RETURN |
312 | #else |
313 | ret |
314 | #endif |
315 | |
316 | L(movsb): |
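	/* REP MOVSB is only used for sizes below
	   __x86_rep_movsb_stop_threshold; at or above it, fall back to
	   the 8x-vector / non-temporal paths.  */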
317 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
318 | jae L(more_8x_vec) |
319 | cmpq %rsi, %rdi |
320 | jb 1f |
321 | /* Source == destination is less common. */ |
322 | je L(nop) |
323 | leaq (%rsi,%rdx), %r9 |
324 | cmpq %r9, %rdi |
325 | /* Avoid slow backward REP MOVSB. */ |
326 | jb L(more_8x_vec_backward) |
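	/* If the Avoid_Short_Distance_REP_MOVSB control bit is set,
	   compute the distance between source and destination (here for
	   the destination-above-source case, under 1: for the opposite
	   case) and reject REP MOVSB for short distances at the check
	   under 2: below.  */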
327 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
329 | jz 3f |
330 | movq %rdi, %rcx |
331 | subq %rsi, %rcx |
332 | jmp 2f |
333 | # endif |
334 | 1: |
335 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
337 | jz 3f |
338 | movq %rsi, %rcx |
339 | subq %rdi, %rcx |
340 | 2: |
341 | /* Avoid "rep movsb" if RCX, the distance between source and destination, |
342 | is N*4GB + [1..63] with N >= 0. */ |
343 | cmpl $63, %ecx |
344 | jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ |
345 | 3: |
346 | # endif |
347 | mov %RDX_LP, %RCX_LP |
348 | rep movsb |
349 | L(nop): |
350 | ret |
351 | #endif |
352 | |
353 | L(less_vec): |
354 | /* Less than 1 VEC. */ |
355 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
356 | # error Unsupported VEC_SIZE! |
357 | #endif |
358 | #if VEC_SIZE > 32 |
359 | cmpb $32, %dl |
360 | jae L(between_32_63) |
361 | #endif |
362 | #if VEC_SIZE > 16 |
363 | cmpb $16, %dl |
364 | jae L(between_16_31) |
365 | #endif |
366 | cmpb $8, %dl |
367 | jae L(between_8_15) |
368 | cmpb $4, %dl |
369 | jae L(between_4_7) |
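	/* Size is in [0, 3] here: 2 and 3 bytes go to L(between_2_3);
	   otherwise copy a single byte unless the size is zero.  */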
370 | cmpb $1, %dl |
371 | ja L(between_2_3) |
372 | jb 1f |
373 | movzbl (%rsi), %ecx |
374 | movb %cl, (%rdi) |
375 | 1: |
376 | ret |
377 | #if VEC_SIZE > 32 |
378 | L(between_32_63): |
379 | /* From 32 to 63. No branch when size == 32. */ |
380 | VMOVU (%rsi), %YMM0 |
381 | VMOVU -32(%rsi,%rdx), %YMM1 |
382 | VMOVU %YMM0, (%rdi) |
383 | VMOVU %YMM1, -32(%rdi,%rdx) |
384 | VZEROUPPER_RETURN |
385 | #endif |
386 | #if VEC_SIZE > 16 |
387 | /* From 16 to 31. No branch when size == 16. */ |
388 | L(between_16_31): |
389 | VMOVU (%rsi), %XMM0 |
390 | VMOVU -16(%rsi,%rdx), %XMM1 |
391 | VMOVU %XMM0, (%rdi) |
392 | VMOVU %XMM1, -16(%rdi,%rdx) |
393 | VZEROUPPER_RETURN |
394 | #endif |
395 | L(between_8_15): |
396 | /* From 8 to 15. No branch when size == 8. */ |
397 | movq -8(%rsi,%rdx), %rcx |
398 | movq (%rsi), %rsi |
399 | movq %rcx, -8(%rdi,%rdx) |
400 | movq %rsi, (%rdi) |
401 | ret |
402 | L(between_4_7): |
403 | /* From 4 to 7. No branch when size == 4. */ |
404 | movl -4(%rsi,%rdx), %ecx |
405 | movl (%rsi), %esi |
406 | movl %ecx, -4(%rdi,%rdx) |
407 | movl %esi, (%rdi) |
408 | ret |
409 | L(between_2_3): |
410 | /* From 2 to 3. No branch when size == 2. */ |
411 | movzwl -2(%rsi,%rdx), %ecx |
412 | movzwl (%rsi), %esi |
413 | movw %cx, -2(%rdi,%rdx) |
414 | movw %si, (%rdi) |
415 | ret |
416 | |
417 | #if defined USE_MULTIARCH && IS_IN (libc) |
418 | L(movsb_more_2x_vec): |
419 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
420 | ja L(movsb) |
421 | #endif |
422 | L(more_2x_vec): |
423 | /* More than 2 * VEC and there may be overlap between destination |
424 | and source. */ |
425 | cmpq $(VEC_SIZE * 8), %rdx |
426 | ja L(more_8x_vec) |
427 | cmpq $(VEC_SIZE * 4), %rdx |
428 | jbe L(last_4x_vec) |
	/* Copy from 4 * VEC + 1 to 8 * VEC bytes, inclusive.  */
430 | VMOVU (%rsi), %VEC(0) |
431 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
432 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
433 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
434 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) |
435 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) |
436 | VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) |
437 | VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) |
438 | VMOVU %VEC(0), (%rdi) |
439 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
440 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
441 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
442 | VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) |
443 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) |
444 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) |
445 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) |
446 | VZEROUPPER_RETURN |
447 | L(last_4x_vec): |
	/* Copy from 2 * VEC + 1 to 4 * VEC bytes, inclusive.  */
449 | VMOVU (%rsi), %VEC(0) |
450 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
451 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) |
452 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) |
453 | VMOVU %VEC(0), (%rdi) |
454 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
455 | VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) |
456 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) |
457 | VZEROUPPER_RETURN |
458 | |
459 | L(more_8x_vec): |
460 | /* Check if non-temporal move candidate. */ |
461 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
462 | /* Check non-temporal store threshold. */ |
463 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
464 | ja L(large_memcpy_2x) |
465 | #endif |
466 | /* Entry if rdx is greater than non-temporal threshold but there |
467 | is overlap. */ |
468 | L(more_8x_vec_check): |
469 | cmpq %rsi, %rdi |
470 | ja L(more_8x_vec_backward) |
471 | /* Source == destination is less common. */ |
472 | je L(nop) |
473 | /* Load the first VEC and last 4 * VEC to support overlapping |
474 | addresses. */ |
475 | VMOVU (%rsi), %VEC(4) |
476 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
477 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
478 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
479 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
480 | /* Save start and stop of the destination buffer. */ |
481 | movq %rdi, %r11 |
482 | leaq -VEC_SIZE(%rdi, %rdx), %rcx |
483 | /* Align destination for aligned stores in the loop. Compute |
484 | how much destination is misaligned. */ |
485 | movq %rdi, %r8 |
486 | andq $(VEC_SIZE - 1), %r8 |
487 | /* Get the negative of offset for alignment. */ |
488 | subq $VEC_SIZE, %r8 |
489 | /* Adjust source. */ |
490 | subq %r8, %rsi |
491 | /* Adjust destination which should be aligned now. */ |
492 | subq %r8, %rdi |
493 | /* Adjust length. */ |
494 | addq %r8, %rdx |
495 | |
496 | .p2align 4 |
497 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
499 | VMOVU (%rsi), %VEC(0) |
500 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
501 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
502 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
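	/* subq $-(VEC_SIZE * 4) is equivalent to addq $(VEC_SIZE * 4);
	   the negated form can use a sign-extended 8-bit immediate when
	   VEC_SIZE * 4 == 128.  */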
503 | subq $-(VEC_SIZE * 4), %rsi |
504 | addq $-(VEC_SIZE * 4), %rdx |
505 | VMOVA %VEC(0), (%rdi) |
506 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
507 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
508 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
509 | subq $-(VEC_SIZE * 4), %rdi |
510 | cmpq $(VEC_SIZE * 4), %rdx |
511 | ja L(loop_4x_vec_forward) |
512 | /* Store the last 4 * VEC. */ |
513 | VMOVU %VEC(5), (%rcx) |
514 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
515 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
516 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
517 | /* Store the first VEC. */ |
518 | VMOVU %VEC(4), (%r11) |
519 | VZEROUPPER_RETURN |
520 | |
521 | L(more_8x_vec_backward): |
522 | /* Load the first 4 * VEC and last VEC to support overlapping |
523 | addresses. */ |
524 | VMOVU (%rsi), %VEC(4) |
525 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
526 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
527 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
528 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) |
529 | /* Save stop of the destination buffer. */ |
530 | leaq -VEC_SIZE(%rdi, %rdx), %r11 |
531 | /* Align destination end for aligned stores in the loop. Compute |
532 | how much destination end is misaligned. */ |
533 | leaq -VEC_SIZE(%rsi, %rdx), %rcx |
534 | movq %r11, %r9 |
535 | movq %r11, %r8 |
536 | andq $(VEC_SIZE - 1), %r8 |
537 | /* Adjust source. */ |
538 | subq %r8, %rcx |
539 | /* Adjust the end of destination which should be aligned now. */ |
540 | subq %r8, %r9 |
541 | /* Adjust length. */ |
542 | subq %r8, %rdx |
543 | |
544 | .p2align 4 |
545 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
547 | VMOVU (%rcx), %VEC(0) |
548 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
549 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
550 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
551 | addq $-(VEC_SIZE * 4), %rcx |
552 | addq $-(VEC_SIZE * 4), %rdx |
553 | VMOVA %VEC(0), (%r9) |
554 | VMOVA %VEC(1), -VEC_SIZE(%r9) |
555 | VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) |
556 | VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) |
557 | addq $-(VEC_SIZE * 4), %r9 |
558 | cmpq $(VEC_SIZE * 4), %rdx |
559 | ja L(loop_4x_vec_backward) |
560 | /* Store the first 4 * VEC. */ |
561 | VMOVU %VEC(4), (%rdi) |
562 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
563 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
564 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
565 | /* Store the last VEC. */ |
566 | VMOVU %VEC(8), (%r11) |
567 | VZEROUPPER_RETURN |
568 | |
569 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
570 | .p2align 4 |
571 | L(large_memcpy_2x): |
572 | /* Compute absolute value of difference between source and |
573 | destination. */ |
574 | movq %rdi, %r9 |
575 | subq %rsi, %r9 |
576 | movq %r9, %r8 |
577 | leaq -1(%r9), %rcx |
578 | sarq $63, %r8 |
579 | xorq %r8, %r9 |
580 | subq %r8, %r9 |
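	/* r9 = |dst - src| (branchless absolute value using the sign
	   mask in r8); rcx keeps (dst - src) - 1 for the page-alias
	   test below.  */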
581 | /* Don't use non-temporal store if there is overlap between |
582 | destination and source since destination may be in cache when |
583 | source is loaded. */ |
584 | cmpq %r9, %rdx |
585 | ja L(more_8x_vec_check) |
586 | |
	/* Cache-align the destination.  First store the first 64 bytes,
	   then adjust source, destination and length so that the
	   destination is 64-byte aligned.  */
589 | VMOVU (%rsi), %VEC(8) |
590 | #if VEC_SIZE < 64 |
591 | VMOVU VEC_SIZE(%rsi), %VEC(9) |
592 | #if VEC_SIZE < 32 |
593 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) |
594 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) |
595 | #endif |
596 | #endif |
597 | VMOVU %VEC(8), (%rdi) |
598 | #if VEC_SIZE < 64 |
599 | VMOVU %VEC(9), VEC_SIZE(%rdi) |
600 | #if VEC_SIZE < 32 |
601 | VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) |
602 | VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) |
603 | #endif |
604 | #endif |
605 | /* Adjust source, destination, and size. */ |
606 | movq %rdi, %r8 |
607 | andq $63, %r8 |
608 | /* Get the negative of offset for alignment. */ |
609 | subq $64, %r8 |
610 | /* Adjust source. */ |
611 | subq %r8, %rsi |
612 | /* Adjust destination which should be aligned now. */ |
613 | subq %r8, %rdi |
614 | /* Adjust length. */ |
615 | addq %r8, %rdx |
616 | |
	/* Test whether the source and destination addresses page-alias.
	   If they do, the larger pipeline in large_memcpy_4x alleviates
	   the performance drop.  */
620 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
621 | jz L(large_memcpy_4x) |
622 | |
623 | movq %rdx, %r10 |
624 | shrq $LOG_4X_MEMCPY_THRESH, %r10 |
625 | cmp __x86_shared_non_temporal_threshold(%rip), %r10 |
626 | jae L(large_memcpy_4x) |
627 | |
628 | /* edx will store remainder size for copying tail. */ |
629 | andl $(PAGE_SIZE * 2 - 1), %edx |
630 | /* r10 stores outer loop counter. */ |
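	/* r10 already holds size >> LOG_4X_MEMCPY_THRESH, so this leaves
	   size / (PAGE_SIZE * 2), the number of 2-page chunks.  */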
631 | shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 |
632 | /* Copy 4x VEC at a time from 2 pages. */ |
633 | .p2align 4 |
634 | L(loop_large_memcpy_2x_outer): |
635 | /* ecx stores inner loop counter. */ |
636 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
637 | L(loop_large_memcpy_2x_inner): |
638 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
639 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
640 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
641 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
642 | /* Load vectors from rsi. */ |
643 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
644 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
645 | subq $-LARGE_LOAD_SIZE, %rsi |
646 | /* Non-temporal store vectors to rdi. */ |
647 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
648 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
649 | subq $-LARGE_LOAD_SIZE, %rdi |
650 | decl %ecx |
651 | jnz L(loop_large_memcpy_2x_inner) |
652 | addq $PAGE_SIZE, %rdi |
653 | addq $PAGE_SIZE, %rsi |
654 | decq %r10 |
655 | jne L(loop_large_memcpy_2x_outer) |
656 | sfence |
657 | |
658 | /* Check if only last 4 loads are needed. */ |
659 | cmpl $(VEC_SIZE * 4), %edx |
660 | jbe L(large_memcpy_2x_end) |
661 | |
662 | /* Handle the last 2 * PAGE_SIZE bytes. */ |
663 | L(loop_large_memcpy_2x_tail): |
	/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
665 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
666 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
667 | VMOVU (%rsi), %VEC(0) |
668 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
669 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
670 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
671 | subq $-(VEC_SIZE * 4), %rsi |
672 | addl $-(VEC_SIZE * 4), %edx |
673 | VMOVA %VEC(0), (%rdi) |
674 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
675 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
676 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
677 | subq $-(VEC_SIZE * 4), %rdi |
678 | cmpl $(VEC_SIZE * 4), %edx |
679 | ja L(loop_large_memcpy_2x_tail) |
680 | |
681 | L(large_memcpy_2x_end): |
682 | /* Store the last 4 * VEC. */ |
683 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
684 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
685 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
686 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
687 | |
688 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
689 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
690 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
691 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
692 | VZEROUPPER_RETURN |
693 | |
694 | .p2align 4 |
695 | L(large_memcpy_4x): |
696 | movq %rdx, %r10 |
697 | /* edx will store remainder size for copying tail. */ |
698 | andl $(PAGE_SIZE * 4 - 1), %edx |
699 | /* r10 stores outer loop counter. */ |
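	/* r10 = size / (PAGE_SIZE * 4), the number of 4-page chunks.  */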
700 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
701 | /* Copy 4x VEC at a time from 4 pages. */ |
702 | .p2align 4 |
703 | L(loop_large_memcpy_4x_outer): |
704 | /* ecx stores inner loop counter. */ |
705 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
706 | L(loop_large_memcpy_4x_inner): |
	/* Only one prefetch set per page, as copying 4 pages at once
	   gives the prefetcher more time to keep up.  */
709 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
710 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
711 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
712 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
713 | /* Load vectors from rsi. */ |
714 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
715 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
716 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
717 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
718 | subq $-LARGE_LOAD_SIZE, %rsi |
719 | /* Non-temporal store vectors to rdi. */ |
720 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
721 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
722 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
723 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
724 | subq $-LARGE_LOAD_SIZE, %rdi |
725 | decl %ecx |
726 | jnz L(loop_large_memcpy_4x_inner) |
727 | addq $(PAGE_SIZE * 3), %rdi |
728 | addq $(PAGE_SIZE * 3), %rsi |
729 | decq %r10 |
730 | jne L(loop_large_memcpy_4x_outer) |
731 | sfence |
732 | /* Check if only last 4 loads are needed. */ |
733 | cmpl $(VEC_SIZE * 4), %edx |
734 | jbe L(large_memcpy_4x_end) |
735 | |
736 | /* Handle the last 4 * PAGE_SIZE bytes. */ |
737 | L(loop_large_memcpy_4x_tail): |
	/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
739 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
740 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
741 | VMOVU (%rsi), %VEC(0) |
742 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
743 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
744 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
745 | subq $-(VEC_SIZE * 4), %rsi |
746 | addl $-(VEC_SIZE * 4), %edx |
747 | VMOVA %VEC(0), (%rdi) |
748 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
749 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
750 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
751 | subq $-(VEC_SIZE * 4), %rdi |
752 | cmpl $(VEC_SIZE * 4), %edx |
753 | ja L(loop_large_memcpy_4x_tail) |
754 | |
755 | L(large_memcpy_4x_end): |
756 | /* Store the last 4 * VEC. */ |
757 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
758 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
759 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
760 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
761 | |
762 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
763 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
764 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
765 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
766 | VZEROUPPER_RETURN |
767 | #endif |
768 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
769 | |
770 | #if IS_IN (libc) |
771 | # ifdef USE_MULTIARCH |
772 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
773 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
774 | # ifdef SHARED |
775 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
776 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
777 | # endif |
778 | # endif |
779 | # ifdef SHARED |
780 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
781 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
782 | # endif |
783 | #endif |
784 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
785 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
786 | |