/* strcpy with AVX2
   Copyright (C) 2011-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRCPY
#  define STRCPY	__strcpy_avx2
# endif

/* Use movsb in page cross case to save code size.  */
# define USE_MOVSB_IN_PAGE_CROSS	1

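/* Character-size dependent primitives: wcscpy operates on 4-byte
   wchar_t elements, so it uses dword compare/min; the byte variants
   are used otherwise.  */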
# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# define PAGE_SIZE	4096

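/* (%END_REG) is the address where the null terminator is stored in the
   destination: (%rax) for stpcpy, which sets rax to dst plus the
   terminator offset before using it, and (%rdi, %rdx) otherwise.  */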
# ifdef USE_AS_STPCPY
#  define END_REG	rax
# else
#  define END_REG	rdi, %rdx
# endif

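/* strcat keeps the original dst in %rax for its return value, so it
   uses %ecx as the scratch register for the page-cross check.  */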
# ifdef USE_AS_STRCAT
#  define PAGE_ALIGN_REG	ecx
# else
#  define PAGE_ALIGN_REG	eax
# endif

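/* VMM(7) is kept zeroed for the whole function and used as the
   comparison operand when searching for the null terminator.  */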
# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

# ifdef USE_AS_STRCAT
	movq	%rdi, %rax
#  include "strcat-strlen-avx2.h.S"
# endif

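	/* If loading VEC_SIZE bytes from src might cross a page
	   boundary, take the slow page-cross path below.  */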
	movl	%esi, %PAGE_ALIGN_REG
	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja	L(page_cross)
L(page_cross_continue):
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
# endif
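	/* Check the first VEC_SIZE bytes of src for a null terminator.  */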
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* The ymm registers are no longer needed, so vzeroupper here
	   rather than duplicating it at each return.  */
	COND_VZEROUPPER

	xorl	%edx, %edx
	bsfl	%ecx, %edx
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
# endif

	/* Use the mask bits in rcx to detect which copy we need. If the
	   low 16 bits of the mask are zero then there must be a bit set
	   in the upper half. I.e. if ecx != 0 and cx == 0, the match
	   must be in the upper 16 bits, so we use L(copy_16_31).  */
	testw	%cx, %cx
	jz	L(copy_16_31)

	testb	%cl, %cl
	jz	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
	movl	$0, (%END_REG)
	ret
# else
	testb	$0x7, %cl
	jz	L(copy_4_7)

	testl	%edx, %edx
	jz	L(set_null_term)
	vmovd	%xmm0, %ecx
	movw	%cx, (%rdi)

	.p2align 4,, 2
L(set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 12
L(copy_4_7):
	movl	-3(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -3(%END_REG)
	ret
# endif

	.p2align 4,, 10
L(copy_16_31):
	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 10
L(copy_8_15):
# ifdef USE_AS_WCSCPY
	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
# else
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
# endif
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret


	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rdi)
# endif
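	/* Round src up to the last byte of its current VEC_SIZE block so
	   that 1(%rsi) is VEC_SIZE-aligned, and adjust dst by the same
	   amount so 1(%rdi) still points at the matching dst byte.  */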
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	addq	%rsi, %rdi
	VMOVA	1(%rsi), %VMM(1)

	/* Try to order stores after as many loads as is reasonable to
	   avoid potential false dependencies.  */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rax)
# endif
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), 1(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %edx
	testl	%edx, %edx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)

	/* Subtract rsi from rdi before aligning rsi. Adding the aligned
	   rsi back yields the rdi (dst) that corresponds to the new
	   src.  */
	subq	%rsi, %rdi
	incq	%rsi
	orq	$(VEC_SIZE * 4 - 1), %rsi
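	/* rsi now points at the last byte of the current 4 * VEC_SIZE
	   block, so (VEC_SIZE * 0 + 1)(%rsi) is aligned to
	   4 * VEC_SIZE and the remaining loads stay VEC_SIZE-aligned.  */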

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %edx
	addq	%rsi, %rdi

	testl	%edx, %edx
	jnz	L(loop_4x_done)

	.p2align 4,, 11
L(loop_4x_vec):

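	/* Store the four vectors loaded and checked above (or on the
	   previous iteration), then load and test the next four. VPMIN
	   reduces them to a single vector whose zero bytes mark a null
	   in any of the four.  */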
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)


	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb	%VMM(6), %edx
	subq	$(VEC_SIZE * -4), %rdi
	testl	%edx, %edx
	jz	L(loop_4x_vec)

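	/* A null was found in one of the last four vectors loaded. Test
	   each vector in order to find which one, storing the earlier
	   vectors in full along the way.  */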
L(loop_4x_done):
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
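	/* The tail copies below load the VEC_SIZE bytes that end at the
	   null terminator and store them at the matching dst offset.
	   This store may overlap bytes that were already written, which
	   is harmless.  */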
L(ret_vec_x4):
	bsfl	%edx, %edx
	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
# endif
L(return_end):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	1(%rcx, %rdi), %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 8
L(ret_vec_x2):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 4
L(page_cross):
	movq	%rsi, %rcx
	andq	$(VEC_SIZE * -1), %rcx

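	/* Load the VEC_SIZE-aligned block containing src (this cannot
	   cross the page) and shift the match mask right by the
	   misalignment so bit 0 corresponds to (%rsi).  */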
	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
	vpmovmskb	%VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx
# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines.  */

	/* Shifting left by CHAR_SIZE adds one char to the later bsf
	   result, which gives copy bounds that include the null
	   terminator. NB: this can never zero-out a non-zero RCX
	   because, to be in the page cross case, rsi cannot be aligned
	   and rcx has already been right-shifted by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif
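	/* rcx holds the copy size in bytes including the null
	   terminator; rep movsb advances rsi/rdi past the copied
	   bytes.  */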
	rep	movsb
#  ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi), %rax
#  endif

	VZEROUPPER_RETURN

# else
	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Traditional copy case, essentially the same as the non-page-
	   cross case above, but since we can't reuse VMM(0) we need
	   twice as many loads from rsi.  */
#  ifndef USE_AS_STRCAT
	xorl	%edx, %edx
#  endif
	bsfl	%ecx, %edx
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
#  elif !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif

	/* vzeroupper early to avoid duplicating it at each return.  */
	COND_VZEROUPPER

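	/* Dispatch on the match position just as in the fast path above:
	   rdx holds the index of the null char and the low bits of the
	   mask are still in cx.  */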
	testw	%cx, %cx
	jz	L(page_cross_copy_16_31)

	testb	%cl, %cl
	jz	L(page_cross_copy_8_15)

	testb	$0x7, %cl
	jz	L(page_cross_copy_4_7)

	testl	%edx, %edx
	jz	L(page_cross_set_null_term)
	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)
L(page_cross_set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl	(%rsi), %ecx
	movl	-3(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, -3(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq	(%rsi), %rcx
	movq	-7(%rsi, %rdx), %rsi
	movq	%rcx, (%rdi)
	movq	%rsi, -7(%END_REG)
	ret


	.p2align 4,, 3
L(page_cross_copy_16_31):
	VMOVU	(%rsi), %xmm0
	VMOVU	-15(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -15(%END_REG)
	ret
# endif

END(STRCPY)
#endif