memcpy-sse2-unaligned.S source code [glibc/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S]

/* memcpy with unaligned loads
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
18
19	#if IS_IN (libc)
20
21	#include <sysdep.h>
22
23	#include "asm-syntax.h"
24
25
/* __memcpy_sse2_unaligned: copy %rdx bytes from (%rsi) to (%rdi).

   Syntax: AT&T (GNU as, run through the C preprocessor).

   In:	%rdi = dst, %rsi = src, %rdx = n (byte count).
   Out:	%rax = dst (the original %rdi, which is never modified).
   Clobbers: %rcx, %rdx, %rsi, %r8, %r9, %r10, %xmm8, flags.
   Stack: not used; the routine makes no calls.

   Strategy
     n <= 16	scalar loads/stores of the first and last 1/2/4/8 bytes
		(the two halves may overlap each other).
     n <= 128	unaligned 16-byte head/tail pairs, again allowed to
		overlap in the middle.
     n >  128	the first and last 64 bytes are copied as above, then a
		loop fills the 64-byte-aligned destination range between
		them with unaligned loads and aligned stores.
     L(overlapping) is a slower path selected by the unsigned test at
		entry; it copies forward in 16-byte or 1-byte steps, or
		backward one byte at a time when dst >= src.  */
ENTRY(__memcpy_sse2_unaligned)
	/* Take the slow path when (src - dst - n) < 2 * n, unsigned.
	   For n == 0 the right-hand side is 0, so the branch is never
	   taken with an empty copy.  */
	movq	%rsi, %rax
	leaq	(%rdx,%rdx), %rcx
	subq	%rdi, %rax
	subq	%rdx, %rax
	cmpq	%rcx, %rax
	jb	L(overlapping)
	cmpq	$16, %rdx
	jbe	L(less_16)
	/* n > 16: copy the first and the last 16 bytes.  The flags set
	   by the cmpq survive the moves and are consumed by the ja.  */
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	.L31
L(return):
	/* Common exit: return the destination pointer.  */
	movq	%rdi, %rax
	ret
	.p2align 4,,10
	.p2align 4
.L31:
	/* n > 32: copy bytes [16, 32) and [n - 32, n - 16).  */
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)
	/* n > 64: copy bytes [32, 64) and [n - 64, n - 32), so the
	   first and last 64 bytes are now complete.  */
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)
	/* n > 128: fill the middle.
	     %rcx = (dst + 64) & -64	first aligned store address
	     %rdx = (dst + n) & -64	aligned end of the loop range
	   The bytes outside [%rcx, %rdx) were already written by the
	   64-byte head and tail copies above.  */
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx
	andq	$-64, %rdx
	andq	$-64, %rcx
	/* Advance src by the same distance dst was advanced.  */
	movq	%rcx, %rax
	subq	%rdi, %rax
	addq	%rax, %rsi
	cmpq	%rdx, %rcx
	je	L(return)
	/* %r10 = src - dst delta; %r9, %r8, %rax = delta + 16/32/48, so
	   (%rcx,reg) addresses the source for each 16-byte lane.  */
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %rax
	.p2align 4,,10
	.p2align 4
L(loop):
	/* 64 bytes per iteration: unaligned loads, aligned stores
	   (%rcx is 64-byte aligned here).  */
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%rax), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	jmp	L(return)
L(overlapping):
	/* Slow path.  dst >= src is copied backward at .L3; otherwise
	   copy forward.  */
	cmpq	%rsi, %rdi
	jae	.L3
	testq	%rdx, %rdx
	.p2align 4,,5
	je	L(return)
	/* %r9 = n / 16 (whole 16-byte chunks), %rax = %r9 * 16 (bytes
	   those chunks cover).  */
	movq	%rdx, %r9
	leaq	16(%rsi), %rcx
	leaq	16(%rdi), %r8
	shrq	$4, %r9
	movq	%r9, %rax
	salq	$4, %rax
	/* Use 16-byte chunks only if the buffers are at least 16 bytes
	   apart (dst >= src + 16 or src >= dst + 16) and n > 15.  Only
	   the low bytes %cl / %r8b of these registers are meaningful.  */
	cmpq	%rcx, %rdi
	setae	%cl
	cmpq	%r8, %rsi
	setae	%r8b
	orl	%r8d, %ecx
	cmpq	$15, %rdx
	seta	%r8b
	testb	%r8b, %cl
	je	.L16
	testq	%rax, %rax
	je	.L16
	/* %rcx = byte offset, %r8 = chunks copied so far.  */
	xorl	%ecx, %ecx
	xorl	%r8d, %r8d
.L7:
	movdqu	(%rsi,%rcx), %xmm8
	addq	$1, %r8
	movdqu	%xmm8, (%rdi,%rcx)
	addq	$16, %rcx
	cmpq	%r8, %r9
	ja	.L7
	/* Done if n was a multiple of 16; otherwise finish the tail.  */
	cmpq	%rax, %rdx
	je	L(return)
.L21:
	/* Forward byte copy of offsets [%rax, n).  */
	movzbl	(%rsi,%rax), %ecx
	movb	%cl, (%rdi,%rax)
	addq	$1, %rax
	cmpq	%rax, %rdx
	ja	.L21
	jmp	L(return)
L(less_16):
	/* n <= 16.  Bits 3 or 4 of n set means 8 <= n <= 16.  */
	testb	$24, %dl
	jne	L(between_9_16)
	/* Bit 2 set means 4 <= n <= 7.  */
	testb	$4, %dl
	.p2align 4,,5
	jne	L(between_5_8)
	testq	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	/* 1 <= n <= 3: copy the first byte; if bit 1 is set (n is 2 or
	   3) also copy the last two bytes.  */
	movzbl	(%rsi), %eax
	testb	$2, %dl
	movb	%al, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %eax
	movw	%ax, -2(%rdi,%rdx)
	jmp	L(return)
.L3:
	/* dst >= src on the slow path: copy bytes from offset n - 1 down
	   to 0.  n is nonzero here because the entry test cannot send an
	   empty copy to L(overlapping).
	   NOTE(review): with the entry test as written this path looks
	   reachable only for sizes close to address wrap-around; confirm
	   whether it can be dropped entirely.  */
	leaq	-1(%rdx), %rax
	.p2align 4,,10
	.p2align 4
.L11:
	movzbl	(%rsi,%rax), %edx
	movb	%dl, (%rdi,%rax)
	/* The subtraction borrows (CF = 1) only after offset 0 has been
	   copied; that is the loop exit.  The previous unconditional
	   jump never left this loop.  */
	subq	$1, %rax
	jnc	.L11
	jmp	L(return)
L(between_9_16):
	/* 8 <= n <= 16: first and last 8 bytes (may overlap).  */
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	movq	-8(%rsi,%rdx), %rax
	movq	%rax, -8(%rdi,%rdx)
	jmp	L(return)
.L16:
	/* Buffers too close or n too small for 16-byte chunks: do the
	   whole forward copy bytewise starting at offset 0.  */
	xorl	%eax, %eax
	jmp	.L21
L(between_5_8):
	/* 4 <= n <= 7: first and last 4 bytes (may overlap).  */
	movl	(%rsi), %eax
	movl	%eax, (%rdi)
	movl	-4(%rsi,%rdx), %eax
	movl	%eax, -4(%rdi,%rdx)
	jmp	L(return)
END(__memcpy_sse2_unaligned)
174
175	#endif
176

Browse the source code of glibc/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S