/* memcpy with unaligned loads
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include <sysdep.h>

#include "asm-syntax.h"

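/* An SSE2 memcpy built around fast unaligned 16-byte loads and stores.
   Rough shape of the algorithm (a sketch, not literal code):

     - buffers might overlap  -> careful forward or backward copy
     - n <= 16                -> small head/tail copies of 1, 2, 4 or 8 bytes
     - n <= 128               -> unaligned 16-byte head and tail copies that
                                 together cover the whole range
     - n > 128                -> copy head and tail as above, then loop over
                                 the middle 64 bytes at a time, with unaligned
                                 loads and 16-byte aligned stores.  */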
ENTRY(__memcpy_sse2_unaligned)
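	/* %rdi = dst, %rsi = src, %rdx = n; the return value is dst.
	   If (size_t) (src - dst - n) < 2 * n, the two buffers may
	   overlap, so take the careful path below.  */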
	movq	%rsi, %rax
	leaq	(%rdx,%rdx), %rcx
	subq	%rdi, %rax
	subq	%rdx, %rax
	cmpq	%rcx, %rax
	jb	L(overlapping)
	cmpq	$16, %rdx
	jbe	L(less_16)
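	/* n > 16: copy the first and the last 16 bytes with unaligned
	   loads/stores; for n <= 32 this already covers everything.  */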
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	.L31
L(return):
	movq	%rdi, %rax
	ret
	.p2align 4,,10
	.p2align 4
.L31:
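	/* n > 32: also copy bytes [16,32) and [n-32,n-16); together with
	   the copies above this handles every n <= 64.  */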
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)
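	/* n > 64: also copy [32,64) and [n-64,n-32), covering every
	   n <= 128.  */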
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)
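	/* n > 128: loop over the middle in 64-byte blocks.  %rcx is dst
	   advanced to a 64-byte boundary (at most 64 bytes in), %rdx is
	   dst + n rounded down to a 64-byte boundary; %rsi is advanced
	   by the same amount as dst.  The head and tail copied above
	   cover whatever the loop leaves out.  */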
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx
	andq	$-64, %rdx
	andq	$-64, %rcx
	movq	%rcx, %rax
	subq	%rdi, %rax
	addq	%rax, %rsi
	cmpq	%rdx, %rcx
	je	L(return)
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %rax
	.p2align 4,,10
	.p2align 4
L(loop):
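	/* Copy 64 bytes per iteration: unaligned loads (movdqu) from the
	   source, 16-byte aligned stores (movdqa) to the destination.  */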
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%rax), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	jmp	L(return)
L(overlapping):
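	/* src and dst may overlap.  When dst < src a forward copy is
	   safe: use 16-byte chunks if src and dst are at least 16 bytes
	   apart and n >= 16, then finish byte by byte at .L21.  When
	   dst >= src, copy backwards one byte at a time at .L3.  */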
	cmpq	%rsi, %rdi
	jae	.L3
	testq	%rdx, %rdx
	.p2align 4,,5
	je	L(return)
	movq	%rdx, %r9
	leaq	16(%rsi), %rcx
	leaq	16(%rdi), %r8
	shrq	$4, %r9
	movq	%r9, %rax
	salq	$4, %rax
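	/* %r9 = number of 16-byte chunks, %rax = n rounded down to a
	   multiple of 16.  The setae/seta tests check that src and dst
	   are at least 16 bytes apart and that n >= 16.  */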
	cmpq	%rcx, %rdi
	setae	%cl
	cmpq	%r8, %rsi
	setae	%r8b
	orl	%r8d, %ecx
	cmpq	$15, %rdx
	seta	%r8b
	testb	%r8b, %cl
	je	.L16
	testq	%rax, %rax
	je	.L16
	xorl	%ecx, %ecx
	xorl	%r8d, %r8d
.L7:
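	/* Copy one 16-byte chunk per iteration until all full chunks
	   are done.  */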
	movdqu	(%rsi,%rcx), %xmm8
	addq	$1, %r8
	movdqu	%xmm8, (%rdi,%rcx)
	addq	$16, %rcx
	cmpq	%r8, %r9
	ja	.L7
	cmpq	%rax, %rdx
	je	L(return)
.L21:
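	/* Copy the remaining bytes one at a time.  */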
	movzbl	(%rsi,%rax), %ecx
	movb	%cl, (%rdi,%rax)
	addq	$1, %rax
	cmpq	%rax, %rdx
	ja	.L21
	jmp	L(return)
L(less_16):
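	/* n <= 16: branch on the size bits.  n & 24 means 8 <= n <= 16,
	   n & 4 means 4 <= n <= 7, otherwise n <= 3.  */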
	testb	$24, %dl
	jne	L(between_9_16)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(between_5_8)
	testq	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	movzbl	(%rsi), %eax
	testb	$2, %dl
	movb	%al, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %eax
	movw	%ax, -2(%rdi,%rdx)
	jmp	L(return)
.L3:
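	/* dst >= src with possible overlap: copy backwards, one byte at
	   a time, from index n - 1 down to 0.  */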
	leaq	-1(%rdx), %rax
	.p2align 4,,10
	.p2align 4
.L11:
	movzbl	(%rsi,%rax), %edx
	movb	%dl, (%rdi,%rax)
	subq	$1, %rax
	jae	.L11	/* Stop after index 0 (the borrow sets CF).  */
	jmp	L(return)
L(between_9_16):
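	/* 8 <= n <= 16: copy an 8-byte head and an 8-byte tail; the two
	   stores overlap when n < 16.  */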
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	movq	-8(%rsi,%rdx), %rax
	movq	%rax, -8(%rdi,%rdx)
	jmp	L(return)
.L16:
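	/* Chunked copy not possible (buffers too close together or
	   n < 16): copy everything byte by byte starting at offset 0.  */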
	xorl	%eax, %eax
	jmp	.L21
L(between_5_8):
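	/* 4 <= n <= 7: copy a 4-byte head and a 4-byte tail; the two
	   stores overlap when n < 8.  */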
	movl	(%rsi), %eax
	movl	%eax, (%rdi)
	movl	-4(%rsi,%rdx), %eax
	movl	%eax, -4(%rdi,%rdx)
	jmp	L(return)
END(__memcpy_sse2_unaligned)

#endif