/* memcpy with AVX
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"
#ifndef MEMCPY
# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
#endif

	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

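/* memcpy/mempcpy/memmove entry point.  The return value (the
   destination, or destination + length for mempcpy) is set up in %rax
   first.  Sizes below 256 bytes are dispatched by size class and
   copied with unaligned loads and stores of the head and the tail of
   the buffer, which may overlap, so no copy loop is needed.  Sizes of
   256 bytes and up fall through to L(256bytesormore).  */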
ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$16, %dl
	jb	L(less_16bytes)
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	0x40(%rsi), %xmm4
	vmovdqu	0x50(%rsi), %xmm5
	vmovdqu	0x60(%rsi), %xmm6
	vmovdqu	0x70(%rsi), %xmm7
	vmovdqu	-0x80(%rcx), %xmm8
	vmovdqu	-0x70(%rcx), %xmm9
	vmovdqu	-0x60(%rcx), %xmm10
	vmovdqu	-0x50(%rcx), %xmm11
	vmovdqu	-0x40(%rcx), %xmm12
	vmovdqu	-0x30(%rcx), %xmm13
	vmovdqu	-0x20(%rcx), %xmm14
	vmovdqu	-0x10(%rcx), %xmm15
	lea	(%rdi, %rdx), %rdx
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, 0x40(%rdi)
	vmovdqu	%xmm5, 0x50(%rdi)
	vmovdqu	%xmm6, 0x60(%rdi)
	vmovdqu	%xmm7, 0x70(%rdi)
	vmovdqu	%xmm8, -0x80(%rdx)
	vmovdqu	%xmm9, -0x70(%rdx)
	vmovdqu	%xmm10, -0x60(%rdx)
	vmovdqu	%xmm11, -0x50(%rdx)
	vmovdqu	%xmm12, -0x40(%rdx)
	vmovdqu	%xmm13, -0x30(%rdx)
	vmovdqu	%xmm14, -0x20(%rdx)
	vmovdqu	%xmm15, -0x10(%rdx)
	ret
	.p2align 4
L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	lea	(%rdi, %rdx), %rdx
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	-0x40(%rcx), %xmm4
	vmovdqu	-0x30(%rcx), %xmm5
	vmovdqu	-0x20(%rcx), %xmm6
	vmovdqu	-0x10(%rcx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, -0x40(%rdx)
	vmovdqu	%xmm5, -0x30(%rdx)
	vmovdqu	%xmm6, -0x20(%rdx)
	vmovdqu	%xmm7, -0x10(%rdx)
	ret

	.p2align 4
L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %xmm0
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	-0x20(%rsi, %rdx), %xmm6
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm6, -0x20(%rdi, %rdx)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

	.p2align 4
L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

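/* Copies of 1 to 15 bytes: move the first and last 8-, 4-, 2- or
   1-byte chunks through general-purpose registers; the two stores may
   overlap, so the whole range is covered without a loop.  */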
	.p2align 4
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	-0x08(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -0x08(%rdi, %rdx)
	ret

	.p2align 4
L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	-0x04(%rsi, %rdx), %ecx
	mov	(%rsi), %esi
	mov	%esi, (%rdi)
	mov	%ecx, -0x04(%rdi, %rdx)
	ret

L(less_4bytes):
	cmp	$1, %dl
	jbe	L(less_2bytes)
	mov	-0x02(%rsi, %rdx), %cx
	mov	(%rsi), %si
	mov	%si, (%rdi)
	mov	%cx, -0x02(%rdi, %rdx)
	ret

L(less_2bytes):
	jb	L(less_0bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_0bytes):
	ret

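/* Copies of 256 bytes or more.  For memmove, switch to a backward copy
   when the destination lies within the source region (dst - src < len,
   unsigned).  Copies of 2048 bytes or more branch to
   L(gobble_data_movsb); otherwise save the first 32 and the last 128
   bytes, advance the destination to the next 32-byte boundary, and
   copy 128 bytes per iteration with aligned stores, writing the saved
   head and tail out last.  */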
	.p2align 4
L(256bytesormore):
#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx
	cmp	%rdx, %rcx
	jc	L(copy_backward)
#endif
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	mov	%rax, %r8
	lea	(%rsi, %rdx), %rcx
	mov	%rdi, %r10
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	mov	$0x80, %rax
	and	$-32, %rdi
	add	$32, %rdi
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	mov	%rdi, %r11
	sub	%r10, %r11
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	sub	%r11, %rdx
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	add	%r11, %rsi
	sub	%eax, %edx
L(gobble_128_loop):
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	add	%rax, %rsi
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	add	%rax, %rdi
	sub	%eax, %edx
	jae	L(gobble_128_loop)
	add	%eax, %edx
	add	%rdi, %rdx
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rdx)
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
	mov	%r8, %rax
	ret

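/* Copies of at least 2048 bytes: use rep movsb while the size is below
   8 * __x86_shared_cache_size_half (SHARED_CACHE_SIZE_HALF when that
   is a compile-time constant); larger copies take the non-temporal
   path below.  */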
	.p2align 4
L(gobble_data_movsb):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	cmp	%rcx, %rdx
	jae	L(gobble_big_data_fwd)
	mov	%rdx, %rcx
	rep	movsb
	ret

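/* Non-temporal forward copy for very large sizes: same head/tail
   handling as the 128-byte loop above, but the main loop prefetches
   ahead with prefetchnta and stores with vmovntdq, followed by an
   sfence before the saved head and tail are written back.  */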
	.p2align 4
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx
	vmovdqu	(%rsi), %ymm4
	vmovdqu	-0x80(%rsi, %rdx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	mov	%rdi, %r8
	and	$-32, %rdi
	add	$32, %rdi
	mov	%rdi, %r10
	sub	%r8, %r10
	sub	%r10, %rdx
	add	%r10, %rsi
	lea	(%rdi, %rdx), %rcx
	add	$-0x80, %rdx
L(gobble_mem_fwd_loop):
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	sub	$-0x80, %rsi
	vmovntdq	%ymm0, (%rdi)
	vmovntdq	%ymm1, 0x20(%rdi)
	vmovntdq	%ymm2, 0x40(%rdi)
	vmovntdq	%ymm3, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_fwd_loop)
	sfence
	vmovdqu	%ymm4, (%r8)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rcx)
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
	ret

#ifdef USE_AS_MEMMOVE
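/* Backward copy for overlapping memmove: save the first 128 bytes and
   the last 32 bytes, round the destination end down to a 32-byte
   boundary, then copy 128 bytes per iteration from the end towards the
   start.  Sizes above 8 * the shared-cache-half value use the
   non-temporal variant at L(gobble_big_data_bwd).  */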
	.p2align 4
L(copy_backward):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	add	%rdx, %rdi
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10
	mov	%rdi, %r11
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	and	$0x1f, %r11
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	xor	%r11, %rdi
	add	%rdx, %rsi
	vmovdqu	-0x20(%rsi), %ymm4
	sub	%r11, %rsi
	sub	%r11, %rdx
	cmp	%rcx, %rdx
	ja	L(gobble_big_data_bwd)
	add	$-0x80, %rdx
L(gobble_mem_bwd_llc):
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovdqa	%ymm0, -0x20(%rdi)
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret

	.p2align 4
L(gobble_big_data_bwd):
	add	$-0x80, %rdx
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovntdq	%ymm0, -0x20(%rdi)
	vmovntdq	%ymm1, -0x40(%rdi)
	vmovntdq	%ymm2, -0x60(%rdi)
	vmovntdq	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_loop)
	sfence
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret
#endif
END (MEMCPY)
#endif