/* memcpy with unaligned loads
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include <sysdep.h>

#include "asm-syntax.h"

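/* memcpy (dst=%rdi, src=%rsi, len=%rdx) built around unaligned 16-byte
   SSE2 loads and stores; DST is returned in %rax.  The entry code
   compares src - dst - len with 2 * len and branches to L(overlapping)
   when it is below; copies of at most 16 bytes go to L(less_16), and
   everything else is done with overlapping 16-byte blocks plus an
   aligned 64-byte loop for large sizes.  */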
ENTRY(__memcpy_sse2_unaligned)
	movq	%rsi, %rax
	leaq	(%rdx,%rdx), %rcx
	subq	%rdi, %rax
	subq	%rdx, %rax
	cmpq	%rcx, %rax
	jb	L(overlapping)
	cmpq	$16, %rdx
	jbe	L(less_16)
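	/* More than 16 bytes: copy the first and the last 16 bytes; the
	   two stores may overlap.  Sizes up to 32 bytes are complete after
	   this pair, larger ones continue at .L31.  */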
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	.L31
L(return):
	movq	%rdi, %rax
	ret
	.p2align 4,,10
	.p2align 4
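/* 33 bytes or more: keep copying 16-byte blocks inward from both ends.
   Up to 128 bytes are finished here; anything larger falls through to
   the aligned-loop setup below.  */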
.L31:
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)
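	/* More than 128 bytes: the first and last 64 bytes are already in
	   place.  Set %rcx to the first 64-byte-aligned destination address
	   past DST and %rdx to the 64-byte-aligned end, advance %rsi by the
	   same amount as the destination, and return if nothing is left in
	   between.  */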
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx
	andq	$-64, %rdx
	andq	$-64, %rcx
	movq	%rcx, %rax
	subq	%rdi, %rax
	addq	%rax, %rsi
	cmpq	%rdx, %rcx
	je	L(return)
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %rax
	.p2align 4,,10
	.p2align 4
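/* Copy 64 bytes per iteration: four unaligned 16-byte loads from the
   source and four aligned stores to the destination in %rcx, until
   %rcx reaches the aligned end kept in %rdx.  %r10, %r9, %r8 and %rax
   hold the source-destination offsets of the four blocks.  */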
L(loop):
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%rax), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	jmp	L(return)
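/* Reached when the entry check found src - dst - len below 2 * len.
   If the destination is not below the source, branch to the byte copy
   at .L3; otherwise copy forward, using 16-byte chunks when the length
   is at least 16 and the two start addresses are at least 16 bytes
   apart, and single bytes otherwise.  */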
L(overlapping):
	cmpq	%rsi, %rdi
	jae	.L3
	testq	%rdx, %rdx
	.p2align 4,,5
	je	L(return)
	movq	%rdx, %r9
	leaq	16(%rsi), %rcx
	leaq	16(%rdi), %r8
	shrq	$4, %r9
	movq	%r9, %rax
	salq	$4, %rax
	cmpq	%rcx, %rdi
	setae	%cl
	cmpq	%r8, %rsi
	setae	%r8b
	orl	%r8d, %ecx
	cmpq	$15, %rdx
	seta	%r8b
	testb	%r8b, %cl
	je	.L16
	testq	%rax, %rax
	je	.L16
	xorl	%ecx, %ecx
	xorl	%r8d, %r8d
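/* Copy %r9 16-byte chunks forward; %rcx is the running byte offset and
   %r8 counts chunks.  */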
.L7:
	movdqu	(%rsi,%rcx), %xmm8
	addq	$1, %r8
	movdqu	%xmm8, (%rdi,%rcx)
	addq	$16, %rcx
	cmpq	%r8, %r9
	ja	.L7
	cmpq	%rax, %rdx
	je	L(return)
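/* Copy the remaining bytes one at a time, starting at offset %rax
   (zero when entered from .L16).  */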
.L21:
	movzbl	(%rsi,%rax), %ecx
	movb	%cl, (%rdi,%rax)
	addq	$1, %rax
	cmpq	%rax, %rdx
	ja	.L21
	jmp	L(return)
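/* At most 16 bytes: test individual length bits and copy with at most
   two possibly overlapping moves of the widest size that fits.  */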
L(less_16):
	testb	$24, %dl
	jne	L(between_9_16)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(between_5_8)
	testq	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	movzbl	(%rsi), %eax
	testb	$2, %dl
	movb	%al, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %eax
	movw	%ax, -2(%rdi,%rdx)
	jmp	L(return)
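/* Overlap path with the destination not below the source: copy
   backwards, one byte at a time, starting from the last byte.  */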
.L3:
	leaq	-1(%rdx), %rax
	.p2align 4,,10
	.p2align 4
.L11:
	movzbl	(%rsi,%rax), %edx
	movb	%dl, (%rdi,%rax)
	subq	$1, %rax
	jmp	.L11
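/* Copy the first and the last 8 bytes; the two moves may overlap.
   Reached for lengths of 8 through 16.  */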
L(between_9_16):
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	movq	-8(%rsi,%rdx), %rax
	movq	%rax, -8(%rdi,%rdx)
	jmp	L(return)
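/* No 16-byte chunks can be used; start the byte loop at offset zero.  */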
.L16:
	xorl	%eax, %eax
	jmp	.L21
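/* Copy the first and the last 4 bytes; the two moves may overlap.
   Reached for lengths of 4 through 7.  */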
L(between_5_8):
	movl	(%rsi), %eax
	movl	%eax, (%rdi)
	movl	-4(%rsi,%rdx), %eax
	movl	%eax, -4(%rdi,%rdx)
	jmp	L(return)
END(__memcpy_sse2_unaligned)

#endif