/* memmove/memcpy/mempcpy optimized for aligned access with SSSE3.
   All versions must be listed in ifunc-impl-list.c.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>
# ifndef MEMMOVE
#  define MEMMOVE	__memmove_ssse3
#  define MEMMOVE_CHK	__memmove_chk_ssse3
#  define MEMCPY	__memcpy_ssse3
#  define MEMCPY_CHK	__memcpy_chk_ssse3
#  define MEMPCPY	__mempcpy_ssse3
#  define MEMPCPY_CHK	__mempcpy_chk_ssse3
# endif

	.section .text.ssse3, "ax", @progbits
# if defined SHARED
ENTRY(MEMPCPY_CHK)
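	/* The fortified entry receives the destination buffer size in
	   %rcx (the 4th argument); jump to __chk_fail if it is smaller
	   than the copy length, otherwise fall through into MEMPCPY
	   below.  */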
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMPCPY_CHK)
# endif

ENTRY(MEMPCPY)
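	/* mempcpy returns dst + len, so compute it in %rax up front,
	   then share the rest of the body with memmove via L(start).  */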
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END(MEMPCPY)

# if defined SHARED
ENTRY(MEMMOVE_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMMOVE_CHK)
# endif

ENTRY_P2ALIGN(MEMMOVE, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	movq	%rdi, %rax
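	/* %rax = dst is the return value for memmove/memcpy.  mempcpy
	   enters at L(start) with %rax already set to dst + len.  */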
L(start):
	cmpq	$16, %rdx
	jb	L(copy_0_15)

	/* These loads are always useful.  */
	movups	0(%rsi), %xmm0
	movups	-16(%rsi, %rdx), %xmm7
	cmpq	$32, %rdx
	ja	L(more_2x_vec)

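	/* 16 <= len <= 32: the head and tail vectors loaded above may
	   overlap in the middle, but storing both still writes every
	   byte of the destination correctly.  */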
	movups	%xmm0, 0(%rdi)
	movups	%xmm7, -16(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_0_15):
	cmpl	$4, %edx
	jb	L(copy_0_3)
	cmpl	$8, %edx
	jb	L(copy_4_7)
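	/* 8 <= len <= 15: copy the first and last 8 bytes with two
	   possibly overlapping load/store pairs (L(copy_4_7) below
	   uses the same trick with 4-byte moves).  */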
	movq	0(%rsi), %rcx
	movq	-8(%rsi, %rdx), %rsi
	movq	%rcx, 0(%rdi)
	movq	%rsi, -8(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_4_7):
	movl	0(%rsi), %ecx
	movl	-4(%rsi, %rdx), %esi
	movl	%ecx, 0(%rdi)
	movl	%esi, -4(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_0_3):
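	/* len is 0..3.  decl sets the flags: negative means len == 0
	   (nothing to do), zero means len == 1 (single byte copy),
	   otherwise len is 2 or 3 and is handled with a byte store at
	   the start plus a possibly overlapping 2-byte store at the
	   end.  */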
	decl	%edx
	jl	L(copy_0_0)
	movb	(%rsi), %cl
	je	L(copy_1_1)

	movzwl	-1(%rsi, %rdx), %esi
	movw	%si, -1(%rdi, %rdx)
L(copy_1_1):
	movb	%cl, (%rdi)
L(copy_0_0):
	ret

	.p2align 4,, 4
L(copy_4x_vec):
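	/* 32 < len <= 64: copy the first and last 32 bytes with four
	   possibly overlapping 16-byte moves.  */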
	movups	16(%rsi), %xmm1
	movups	-32(%rsi, %rdx), %xmm2

	movups	%xmm0, 0(%rdi)
	movups	%xmm1, 16(%rdi)
	movups	%xmm2, -32(%rdi, %rdx)
	movups	%xmm7, -16(%rdi, %rdx)
L(nop):
	ret

	.p2align 4
L(more_2x_vec):
	cmpq	$64, %rdx
	jbe	L(copy_4x_vec)

	/* We use rcx later to get the alignr value.  */
	movq	%rdi, %rcx

	/* If dst - src < len the regions overlap with dst > src, so
	   fall back to a backward copy for memmove safety.  */
	subq	%rsi, %rcx
	cmpq	%rdx, %rcx
	jb	L(copy_backward)
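	/* The unsigned compare above handles every case: if src > dst,
	   dst - src wraps to a huge value, the branch is not taken and
	   the always-safe forward copy is used.  If dst > src but
	   dst - src >= len, the ranges do not overlap destructively
	   and a forward copy is also fine.  The backward path handles
	   dst > src with overlap, plus dst == src (which it turns into
	   a no-op).  */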

	/* Load the tail.  */

	/* -16(%rsi, %rdx) is already loaded into xmm7.  */
	movups	-32(%rsi, %rdx), %xmm8
	movups	-48(%rsi, %rdx), %xmm9

	/* Get the relative misalignment of dst and src (0..15).  */
	andl	$0xf, %ecx

	movq	%rsi, %r9
	addq	%rcx, %rsi
	andq	$-16, %rsi
	/* Get the first vec for `palignr`.  */
	movaps	(%rsi), %xmm1
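	/* %rsi has been rounded to a 16-byte boundary chosen so that
	   aligned loads from it, merged with `palignr` in the loops
	   below, reproduce the unaligned source data at the offsets
	   needed for the aligned destination stores.  %xmm1 seeds the
	   palignr chain with the first aligned vector.  */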

	/* We have already loaded (%rsi), so it is safe to do this
	   store before the loop.  */
	movups	%xmm0, (%rdi)

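	/* Copies larger than half the shared cache size take the
	   non-temporal store path at L(large_memcpy) below.  */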
# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
# else
	cmp	__x86_shared_cache_size_half(%rip), %RDX_LP
# endif
	ja	L(large_memcpy)

	leaq	-64(%rdi, %rdx), %r8
	andq	$-16, %rdi
	movl	$48, %edx

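	/* Dispatch without a jump table: %ecx holds (dst - src) & 15
	   and is scaled by 64 below to index the 64-byte loop bodies
	   at L(loop_fwd_start).  Their descending layout makes the
	   offset land on the loop whose palignr immediate equals the
	   required byte shift.  */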
	leaq	L(loop_fwd_start)(%rip), %r9
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	*%rcx

	.p2align 4,, 8
L(copy_backward):
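	/* %rcx still holds dst - src; zero means src == dst and there
	   is nothing to copy.  */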
	testq	%rcx, %rcx
	jz	L(nop)

	/* Preload the first 48 bytes; the backward loop stores them
	   last.  */

	/* (%rsi) is already loaded into xmm0.  */
	movups	16(%rsi), %xmm4
	movups	32(%rsi), %xmm5

	movq	%rdi, %r8
	subq	%rdi, %rsi
	leaq	-49(%rdi, %rdx), %rdi
	andq	$-16, %rdi
	addq	%rdi, %rsi
	andq	$-16, %rsi
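	/* %r8 preserves the original dst: it is the loop bound and the
	   base for the final stores.  %rdi is now (dst + len - 49)
	   rounded down to a 16-byte boundary, the base of the highest
	   48-byte block the backward loop writes, and %rsi is the
	   corresponding source address, also rounded down to 16
	   bytes.  */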

	movaps	48(%rsi), %xmm6


	leaq	L(loop_bkwd_start)(%rip), %r9
	andl	$0xf, %ecx
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	*%rcx

	.p2align 4,, 8
L(large_memcpy):
	movups	-64(%r9, %rdx), %xmm10
	movups	-80(%r9, %rdx), %xmm11

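	/* Dispatch for the non-temporal loops: %ecx = (dst - src) & 15
	   becomes %r8d = %ecx * 96 (shift left by 5, then multiply by
	   3), matching the 96-byte spacing of the
	   ALIGNED_LARGE_LOOP_FWD bodies below.  */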
	sall	$5, %ecx
	leal	(%rcx, %rcx, 2), %r8d
	leaq	-96(%rdi, %rdx), %rcx
	andq	$-16, %rdi
	leaq	L(large_loop_fwd_start)(%rip), %rdx
	addq	%r8, %rdx
	jmp	*%rdx


	/* Instead of a typical jump table, all 16 loops are exactly
	   64 bytes in size, so we can jump straight to the first loop
	   plus the alignment offset times 64.  Before modifying any
	   loop, ensure all their sizes still match!  */
	.p2align 6
L(loop_fwd_start):
L(loop_fwd_0x0):
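	/* dst - src is a multiple of 16, so no realignment is needed:
	   copy 48 bytes per iteration with aligned loads and stores.
	   %rdx was set to 48 above, %r8 points 64 bytes before the end
	   of dst, and the preloaded tail (%xmm7-%xmm9) is stored after
	   the loop.  */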
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	%xmm1, 16(%rdi)
	movaps	%xmm2, 32(%rdi)
	movaps	%xmm3, 48(%rdi)
	addq	%rdx, %rdi
	addq	%rdx, %rsi
	cmpq	%rdi, %r8
	ja	L(loop_fwd_0x0)
L(end_loop_fwd):
	movups	%xmm9, 16(%r8)
	movups	%xmm8, 32(%r8)
	movups	%xmm7, 48(%r8)
	ret

	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
	   60 bytes otherwise.  */
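	/* The realigning loops merge consecutive aligned source
	   vectors with `palignr $N` (concatenate two vectors and
	   extract 16 bytes starting at byte N), recreating the
	   unaligned source data without unaligned loads.  %xmm1
	   carries the last vector of one iteration into the next.  */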
# define ALIGNED_LOOP_FWD(align_by);	\
	.p2align 6;	\
L(loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	%xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm4, %xmm1;	\
	movaps	%xmm0, 16(%rdi);	\
	movaps	%xmm2, 32(%rdi);	\
	movaps	%xmm3, 48(%rdi);	\
	addq	%rdx, %rdi;	\
	addq	%rdx, %rsi;	\
	cmpq	%rdi, %r8;	\
	ja	L(loop_fwd_ ## align_by);	\
	jmp	L(end_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_FWD (0xf)
	ALIGNED_LOOP_FWD (0xe)
	ALIGNED_LOOP_FWD (0xd)
	ALIGNED_LOOP_FWD (0xc)
	ALIGNED_LOOP_FWD (0xb)
	ALIGNED_LOOP_FWD (0xa)
	ALIGNED_LOOP_FWD (0x9)
	ALIGNED_LOOP_FWD (0x8)
	ALIGNED_LOOP_FWD (0x7)
	ALIGNED_LOOP_FWD (0x6)
	ALIGNED_LOOP_FWD (0x5)
	ALIGNED_LOOP_FWD (0x4)
	ALIGNED_LOOP_FWD (0x3)
	ALIGNED_LOOP_FWD (0x2)
	ALIGNED_LOOP_FWD (0x1)

	.p2align 6
L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
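	/* Large copies: move 80 bytes per iteration with movntps
	   non-temporal stores, which bypass the cache.  This is the
	   (dst - src) % 16 == 0 variant; the realigning variants
	   follow below.  */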
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	64(%rsi), %xmm4
	movaps	80(%rsi), %xmm5
	movntps	%xmm1, 16(%rdi)
	movntps	%xmm2, 32(%rdi)
	movntps	%xmm3, 48(%rdi)
	movntps	%xmm4, 64(%rdi)
	movntps	%xmm5, 80(%rdi)
	addq	$80, %rdi
	addq	$80, %rsi
	cmpq	%rdi, %rcx
	ja	L(large_loop_fwd_0x0)

	/* Ensure no icache line split on tail.  */
	.p2align 4
L(end_large_loop_fwd):
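	/* The weakly ordered non-temporal stores above require an
	   sfence so they are ordered before the ordinary stores of the
	   tail.  */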
	sfence
	movups	%xmm11, 16(%rcx)
	movups	%xmm10, 32(%rcx)
	movups	%xmm9, 48(%rcx)
	movups	%xmm8, 64(%rcx)
	movups	%xmm7, 80(%rcx)
	ret


	/* Each of these loop bodies is larger than 64 bytes and at
	   most 96 bytes; the 32-byte alignment between them ensures
	   96-byte spacing between each.  */
# define ALIGNED_LARGE_LOOP_FWD(align_by);	\
	.p2align 5;	\
L(large_loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	64(%rsi), %xmm4;	\
	movaps	80(%rsi), %xmm5;	\
	movaps	%xmm5, %xmm6;	\
	palignr	$align_by, %xmm4, %xmm5;	\
	palignr	$align_by, %xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm6, %xmm1;	\
	movntps	%xmm0, 16(%rdi);	\
	movntps	%xmm2, 32(%rdi);	\
	movntps	%xmm3, 48(%rdi);	\
	movntps	%xmm4, 64(%rdi);	\
	movntps	%xmm5, 80(%rdi);	\
	addq	$80, %rdi;	\
	addq	$80, %rsi;	\
	cmpq	%rdi, %rcx;	\
	ja	L(large_loop_fwd_ ## align_by);	\
	jmp	L(end_large_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LARGE_LOOP_FWD (0xf)
	ALIGNED_LARGE_LOOP_FWD (0xe)
	ALIGNED_LARGE_LOOP_FWD (0xd)
	ALIGNED_LARGE_LOOP_FWD (0xc)
	ALIGNED_LARGE_LOOP_FWD (0xb)
	ALIGNED_LARGE_LOOP_FWD (0xa)
	ALIGNED_LARGE_LOOP_FWD (0x9)
	ALIGNED_LARGE_LOOP_FWD (0x8)
	ALIGNED_LARGE_LOOP_FWD (0x7)
	ALIGNED_LARGE_LOOP_FWD (0x6)
	ALIGNED_LARGE_LOOP_FWD (0x5)
	ALIGNED_LARGE_LOOP_FWD (0x4)
	ALIGNED_LARGE_LOOP_FWD (0x3)
	ALIGNED_LARGE_LOOP_FWD (0x2)
	ALIGNED_LARGE_LOOP_FWD (0x1)


	.p2align 6
L(loop_bkwd_start):
L(loop_bkwd_0x0):
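	/* Backward copy: walk down 48 bytes per iteration so that
	   overlapping source bytes are read before they are
	   overwritten.  %r8 holds the original dst as the lower bound;
	   the first 48 bytes (%xmm0, %xmm4, %xmm5) and the last 16
	   bytes (%xmm7) were preloaded and are stored after the loop.
	   This is the (dst - src) % 16 == 0 variant.  */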
	movaps	32(%rsi), %xmm1
	movaps	16(%rsi), %xmm2
	movaps	0(%rsi), %xmm3
	movaps	%xmm1, 32(%rdi)
	movaps	%xmm2, 16(%rdi)
	movaps	%xmm3, 0(%rdi)
	subq	$48, %rdi
	subq	$48, %rsi
	cmpq	%rdi, %r8
	jb	L(loop_bkwd_0x0)
L(end_loop_bkwd):
	movups	%xmm7, -16(%r8, %rdx)
	movups	%xmm0, 0(%r8)
	movups	%xmm4, 16(%r8)
	movups	%xmm5, 32(%r8)

	ret


	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
	   60 bytes otherwise.  */
# define ALIGNED_LOOP_BKWD(align_by);	\
	.p2align 6;	\
L(loop_bkwd_ ## align_by):	\
	movaps	32(%rsi), %xmm1;	\
	movaps	16(%rsi), %xmm2;	\
	movaps	0(%rsi), %xmm3;	\
	palignr	$align_by, %xmm1, %xmm6;	\
	palignr	$align_by, %xmm2, %xmm1;	\
	palignr	$align_by, %xmm3, %xmm2;	\
	movaps	%xmm6, 32(%rdi);	\
	movaps	%xmm1, 16(%rdi);	\
	movaps	%xmm2, 0(%rdi);	\
	subq	$48, %rdi;	\
	subq	$48, %rsi;	\
	movaps	%xmm3, %xmm6;	\
	cmpq	%rdi, %r8;	\
	jb	L(loop_bkwd_ ## align_by);	\
	jmp	L(end_loop_bkwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_BKWD (0xf)
	ALIGNED_LOOP_BKWD (0xe)
	ALIGNED_LOOP_BKWD (0xd)
	ALIGNED_LOOP_BKWD (0xc)
	ALIGNED_LOOP_BKWD (0xb)
	ALIGNED_LOOP_BKWD (0xa)
	ALIGNED_LOOP_BKWD (0x9)
	ALIGNED_LOOP_BKWD (0x8)
	ALIGNED_LOOP_BKWD (0x7)
	ALIGNED_LOOP_BKWD (0x6)
	ALIGNED_LOOP_BKWD (0x5)
	ALIGNED_LOOP_BKWD (0x4)
	ALIGNED_LOOP_BKWD (0x3)
	ALIGNED_LOOP_BKWD (0x2)
	ALIGNED_LOOP_BKWD (0x1)
END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
# if defined SHARED
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
# endif
#endif