/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007-2016 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

#include <sysdep.h>
#include "asm-syntax.h"

/* Stack slots in the red-zone. */

#ifdef USE_AS_MEMPCPY
# define RETVAL	(0)
#else
# define RETVAL	(-8)
# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
#  define memcpy	__memcpy
#  undef libc_hidden_builtin_def
#  define libc_hidden_builtin_def(name) \
   .globl __GI_memcpy; __GI_memcpy = __memcpy
# endif
#endif
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)
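
/*
	All of these slots live in the 128-byte red zone below %rsp defined
	by the x86-64 SysV ABI, so no stack frame has to be set up.  RETVAL
	holds the saved return value for memcpy; for mempcpy the running
	%rdi is itself the return value, so no slot is needed and the
	register-save area starts right at -8(%rsp).  SAVE0..SAVE3 are used
	by the large-block loops below to spill the callee-saved registers
	%rbx and %r12-%r14.
*/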

	.text
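
/*
	__memcpy_chk (dst, src, len, dstlen): %rcx holds the size of the
	destination buffer; bail out through __chk_fail if it is smaller
	than the number of bytes to copy in %rdx, otherwise fall through
	into memcpy.
*/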

#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memcpy_chk)

	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)

END_CHK (__memcpy_chk)
#endif

ENTRY(memcpy)				/* (void *, const void*, size_t) */

/* Handle tiny blocks. */
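/*
	Sizes below 32 bytes are copied by peeling off one power of two at a
	time: 1 byte if bit 0 of the length is set, then 2, 4 and 8 bytes,
	and finally a 16-byte loop for whatever multiple of 16 remains.
	L(1) is also the common tail that the larger-block paths jump back
	to for their leftovers (up to 127 bytes), which is why the 16-byte
	step is a loop rather than a single copy.
*/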

L(1try):				/* up to 32B */
	cmpq	$32, %rdx
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */
#endif
	jae	L(1after)

L(1):					/* 1-byte once */
	testb	$1, %dl
	jz	L(1a)

	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)

	incq	%rsi
	incq	%rdi

	.p2align 4,, 4

L(1a):					/* 2-byte once */
	testb	$2, %dl
	jz	L(1b)

	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)

	addq	$2, %rsi
	addq	$2, %rdi

	.p2align 4,, 4

L(1b):					/* 4-byte once */
	testb	$4, %dl
	jz	L(1c)

	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)

	addq	$4, %rsi
	addq	$4, %rdi

	.p2align 4,, 4

L(1c):					/* 8-byte once */
	testb	$8, %dl
	jz	L(1d)

	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)

	addq	$8, %rsi
	addq	$8, %rdi

	.p2align 4,, 4

L(1d):					/* 16-byte loop */
	andl	$0xf0, %edx
	jz	L(exit)

	.p2align 4

L(1loop):
	movq	(%rsi), %rcx
	movq	8(%rsi), %r8
	movq	%rcx, (%rdi)
	movq	%r8, 8(%rdi)

	subl	$16, %edx

	leaq	16(%rsi), %rsi
	leaq	16(%rdi), %rdi

	jnz	L(1loop)

	.p2align 4,, 4

L(exit):				/* exit */
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value */
#else
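	/* "rep; ret" rather than a plain "ret": the prefix is ignored, but
	   the two-byte return reportedly avoids a branch-prediction penalty
	   on some AMD processors when RET is a branch target or directly
	   follows a conditional branch.  */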
	rep
#endif
	retq

	.p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL(%rsp)	/* save return value */
#endif

/* Align to the natural word size. */
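/*
	Only the source is aligned: single bytes are copied until %rsi is
	8-byte aligned, and %rdx is reduced to the count remaining after the
	alignment copy.  The destination may stay misaligned; the copy loops
	below tolerate that.
*/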

L(aligntry):
	movl	%esi, %ecx		/* align by source */

	andl	$7, %ecx
	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
	subl	$8, %ecx

	.p2align 4

L(alignloop):				/* 1-byte alignment loop */
	movzbl	(%rsi), %eax
	movb	%al, (%rdi)

	incl	%ecx

	leaq	1(%rsi), %rsi
	leaq	1(%rdi), %rdi

	jnz	L(alignloop)

	.p2align 4

L(alignafter):

/* Handle mid-sized blocks. */
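/*
	Blocks of up to 1 KiB are copied 32 bytes per iteration.  The loop
	body is unrolled twice; the early exit after the first half ("help
	out smaller blocks") saves the backward branch when an odd number of
	32-byte chunks remains.  The final 0..31 bytes are handled by
	jumping back to L(1).
*/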

L(32try):				/* up to 1KB */
	cmpq	$1024, %rdx
	ja	L(32after)

L(32):					/* 32-byte loop */
	movl	%edx, %ecx
	shrl	$5, %ecx
	jz	L(32skip)

	.p2align 4

L(32loop):
	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jz	L(32skip)		/* help out smaller blocks */

	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jnz	L(32loop)

	.p2align 4

L(32skip):
	andl	$31, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(32after):

/*
	In order to minimize code size, the algorithms specific to larger
	blocks are excluded when building for RTLD.
*/

/* Handle blocks smaller than 1/2 L1. */
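/*
	__x86_data_cache_size_half is set up by glibc's cache detection and
	holds half the size of the L1 data cache.  Up to that many bytes are
	copied with REP MOVSQ.  Outside libc.so this is the last algorithm,
	so REP MOVSQ then handles the whole remaining block, whatever its
	size.
*/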

L(fasttry):				/* first 1/2 L1 */
#if IS_IN (libc)			/* only up to this algorithm outside of libc.so */
	mov	__x86_data_cache_size_half(%rip), %R11_LP
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
#endif

L(fast):				/* good ol' MOVS */
#if IS_IN (libc)
	movq	%r11, %rcx
	andq	$-8, %r11
#else
	movq	%rdx, %rcx
#endif
	shrq	$3, %rcx
	jz	L(fastskip)

	rep
	movsq

	.p2align 4,, 4

L(fastskip):
#if IS_IN (libc)
	subq	%r11, %rdx		/* check for more */
	testq	$-8, %rdx
	jnz	L(fastafter)
#endif

	andl	$7, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#if IS_IN (libc)			/* none of the algorithms below for RTLD */

	.p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2. */
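/*
	__x86_shared_cache_size_half holds half the size of the shared cache
	(the "1/2 L2" above).  Blocks up to that size are copied one 64-byte
	cache line per iteration, unrolled twice, with software prefetch
	about 896 bytes (14 lines) ahead of the copy.  When the CPU supports
	PREFETCHW (__x86_prefetchw non-zero), the destination lines are
	prefetched with intent to write, which avoids a separate
	read-for-ownership and so reduces cache-probe traffic on
	multiprocessor systems; otherwise plain PREFETCHT0 is used on the
	destination as well.
*/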

L(pretry):				/* first 1/2 L2 */
	mov	__x86_shared_cache_size_half(%rip), %R8_LP
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */
	movq	%r8, %rcx
	andq	$-64, %r8
	shrq	$6, %rcx
	jz	L(preskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3(%rsp)
	cfi_rel_offset (%rbx, SAVE3)

	cmpl	$0, __x86_prefetchw(%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */

	.p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */

L(prewloop):				/* cache-line in state M */
	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 + 0(%rsi)
	prefetcht0 896 + 64(%rsi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	prefetchw 896 - 64(%rdi)
	prefetchw 896 - 0(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(prewloop)
	jmp	L(prebail)

	.p2align 4

/* ... when PREFETCHW is not available. */

L(preloop):				/* cache-line in state E */
	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 + 0(%rsi)
	prefetcht0 896 + 64(%rsi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 - 64(%rdi)
	prefetcht0 896 - 0(%rdi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(preloop)

L(prebail):
	movq	SAVE3(%rsp), %rbx
	cfi_restore (%rbx)
	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

	/* .p2align 4 */

L(preskip):
	subq	%r8, %rdx		/* check for more */
	testq	$-64, %rdx
	jnz	L(preafter)

	andl	$63, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(preafter):

/* Handle huge blocks. */
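/*
	Blocks larger than half the shared cache are copied 128 bytes per
	iteration with non-temporal MOVNTI stores, which bypass the cache
	and avoid evicting its contents, while PREFETCHNTA streams the
	source in.  The non-temporal stores are weakly ordered, so an SFENCE
	is issued before returning.
*/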

L(NTtry):

L(NT):					/* non-temporal 128-byte */
	movq	%rdx, %rcx
	shrq	$7, %rcx
	jz	L(NTskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)

	.p2align 4

L(NTloop):
	prefetchnta 768(%rsi)
	prefetchnta 832(%rsi)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movntiq	%rax, (%rdi)
	movntiq	%r8, 8(%rdi)
	movntiq	%r9, 16(%rdi)
	movntiq	%r10, 24(%rdi)
	movntiq	%r11, 32(%rdi)
	movntiq	%r12, 40(%rdi)
	movntiq	%r13, 48(%rdi)
	movntiq	%r14, 56(%rdi)

	movq	64(%rsi), %rax
	movq	72(%rsi), %r8
	movq	80(%rsi), %r9
	movq	88(%rsi), %r10
	movq	96(%rsi), %r11
	movq	104(%rsi), %r12
	movq	112(%rsi), %r13
	movq	120(%rsi), %r14

	movntiq	%rax, 64(%rdi)
	movntiq	%r8, 72(%rdi)
	movntiq	%r9, 80(%rdi)
	movntiq	%r10, 88(%rdi)
	movntiq	%r11, 96(%rdi)
	movntiq	%r12, 104(%rdi)
	movntiq	%r13, 112(%rdi)
	movntiq	%r14, 120(%rdi)

	leaq	128(%rsi), %rsi
	leaq	128(%rdi), %rdi

	jnz	L(NTloop)

	sfence				/* serialize memory stores */

	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

L(NTskip):
	andl	$127, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#endif /* IS_IN (libc) */

END(memcpy)
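
/*
	In the non-multiarch shared libc build the code above is assembled
	under the name __memcpy (see the redefinition near the top of this
	file); libc_hidden_builtin_def then provides the internal
	__GI_memcpy alias and versioned_symbol exports the function as
	memcpy@GLIBC_2.14.
*/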

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
#  undef memcpy
#  include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# endif
#endif