1 | /* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. |
2 | Copyright (C) 2022-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (4) |
22 | |
23 | /* Use evex-masked stores for small sizes. Turned off at the |
24 | moment. */ |
25 | # define USE_EVEX_MASKED_STORE 0 |
26 | |
27 | |
28 | # include <sysdep.h> |
29 | # ifndef VEC_SIZE |
30 | # include "x86-evex256-vecs.h" |
31 | # endif |
32 | |
33 | |
34 | # ifndef STRNCPY |
35 | # define STRNCPY __strncpy_evex |
36 | # endif |
37 | |
38 | # ifdef USE_AS_WCSCPY |
39 | # define VMOVU_MASK vmovdqu32 |
40 | # define VPCMPEQ vpcmpeqd |
41 | # define VPMIN vpminud |
42 | # define VPTESTN vptestnmd |
43 | # define VPTEST vptestmd |
44 | # define CHAR_SIZE 4 |
45 | |
46 | # define REP_MOVS rep movsd |
47 | # define REP_STOS rep stosl |
48 | |
49 | # define USE_WIDE_CHAR |
50 | |
51 | # else |
52 | # define VMOVU_MASK vmovdqu8 |
53 | # define VPCMPEQ vpcmpeqb |
54 | # define VPMIN vpminub |
55 | # define VPTESTN vptestnmb |
56 | # define VPTEST vptestmb |
57 | # define CHAR_SIZE 1 |
58 | |
59 | # define REP_MOVS rep movsb |
60 | # define REP_STOS rep stosb |
61 | # endif |
62 | |
63 | # include "strncpy-or-cat-overflow-def.h" |
64 | |
65 | # define PAGE_SIZE 4096 |
66 | # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
67 | |
68 | # include "reg-macros.h" |
69 | |
70 | |
71 | # define VZERO VMM(7) |
72 | # define VZERO_256 VMM_256(7) |
73 | # define VZERO_128 VMM_128(7) |
74 | |
75 | # if VEC_SIZE == 64 |
76 | # define VZERO_HALF VZERO_256 |
77 | # else |
78 | # define VZERO_HALF VZERO_128 |
79 | # endif |
80 | |
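/* Overview: copy with full-VEC loads and (possibly overlapping)
stores, using a 4x VEC main loop for long strings. Once the
zero CHAR is found, the rest of the buffer is zero-filled with
overlapping vector stores. Page-crossing first reads and
absurdly large lengths fall back to rep movs / rep stos. */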
.section SECTION(.text), "ax", @progbits
82 | ENTRY(STRNCPY) |
83 | # ifdef __ILP32__ |
84 | /* Clear the upper 32 bits. */ |
85 | movl %edx, %edx |
86 | # endif |
/* Filter zero length strings and very long strings. Zero
length strings just return; very long strings are handled by
running rep stos{b|l} to zero-fill the destination (which will
almost certainly segfault). If that somehow succeeds, finish by
calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
92 | # ifdef USE_AS_WCSCPY |
93 | decq %rdx |
94 | movq %rdx, %rax |
/* A length needing more than 56 bits is past the end of the
maximum supported address space. */
96 | shr $56, %rax |
97 | jnz L(zero_len) |
98 | # else |
99 | decq %rdx |
/* If the branch below needs to become `jb`, replace the `dec`
with `sub` (dec does not set the carry flag). */
102 | jl L(zero_len) |
103 | # endif |
104 | |
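/* VZERO is kept as an all-zeros register for the zero-CHAR
comparisons and for zero-filling. Check whether loading a full
VEC at rsi would cross a page boundary. */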
105 | vpxorq %VZERO_128, %VZERO_128, %VZERO_128 |
106 | movl %esi, %eax |
107 | andl $(PAGE_SIZE - 1), %eax |
108 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
109 | ja L(page_cross) |
110 | |
111 | L(page_cross_continue): |
112 | VMOVU (%rsi), %VMM(0) |
113 | VPTESTN %VMM(0), %VMM(0), %k0 |
114 | KMOV %k0, %VRCX |
115 | |
/* If not STPCPY, the return value is just dst, so save it ahead
of time. */
117 | # ifndef USE_AS_STPCPY |
118 | movq %rdi, %rax |
119 | # endif |
120 | |
121 | |
122 | cmpq $(CHAR_PER_VEC), %rdx |
123 | |
/* If USE_EVEX_MASKED_STORE is enabled then we just handle
lengths <= CHAR_PER_VEC with masked instructions (which have
the potential for dramatically bad performance if dst splits a
page and is not in the TLB). */
128 | # if USE_EVEX_MASKED_STORE |
129 | /* `jae` because length rdx is now length - 1. */ |
130 | jae L(more_1x_vec) |
131 | |
/* If there were multiple zero-CHAR matches in the first VEC,
VRCX will have extra bits set, but that's fine since the extra
bits are all at zero positions anyway. */
135 | |
136 | # ifdef USE_AS_STPCPY |
137 | tzcnt %VRCX, %VRAX |
138 | cmpl %eax, %edx |
139 | cmovb %edx, %eax |
140 | # ifdef USE_AS_WCSCPY |
141 | adcl $0, %eax |
142 | leaq (%rdi, %rax, CHAR_SIZE), %rax |
143 | # else |
144 | adcq %rdi, %rax |
145 | # endif |
146 | # endif |
147 | dec %VRCX |
148 | |
149 | /* Zero out all non-zero CHAR's after the first zero match. */ |
150 | KMOV %VRCX, %k1 |
151 | |
/* Use VZERO as the destination so this can be reused for
L(zfill_less_vec) (which, if jumped to by subsequent logic,
will have zeroed out VZERO). */
155 | VMOVU_MASK %VMM(0), %VZERO{%k1}{z} |
156 | L(zfill_less_vec): |
157 | /* Get mask for what we need to set. */ |
158 | incl %edx |
159 | mov $-1, %VRCX |
160 | bzhi %VRDX, %VRCX, %VRCX |
161 | KMOV %VRCX, %k1 |
162 | VMOVU_MASK %VZERO, (%rdi){%k1} |
163 | ret |
164 | |
165 | .p2align 4,, 4 |
166 | L(zero_len): |
167 | cmpq $-1, %rdx |
168 | jne L(best_effort_strncpy) |
169 | movq %rdi, %rax |
170 | ret |
171 | |
172 | .p2align 4,, 8 |
173 | L(more_1x_vec): |
174 | # else |
175 | /* `jb` because length rdx is now length - 1. */ |
176 | jb L(less_1x_vec) |
177 | # endif |
178 | |
179 | |
/* This may store more than necessary, but that's fine because
we still need to zero-fill. */
182 | VMOVU %VMM(0), (%rdi) |
183 | |
184 | |
/* Length must be >= CHAR_PER_VEC so a match here means we must
zero-fill. */
187 | test %VRCX, %VRCX |
188 | jnz L(zfill) |
189 | |
190 | |
/* We are going to align rsi here so we will need to readjust
rdi/rdx afterwards. NB: We filtered out huge lengths so
rsi + rdx * CHAR_SIZE cannot overflow. */
194 | leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx |
195 | subq %rsi, %rdi |
196 | andq $-(VEC_SIZE), %rsi |
197 | |
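/* rdi currently holds dst - src and rdx an end pointer within
the source. Recover the actual dst and the remaining length
(in CHARs for wide chars) relative to the current rsi. */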
198 | L(loop_last_4x_vec): |
199 | addq %rsi, %rdi |
200 | subq %rsi, %rdx |
201 | # ifdef USE_AS_WCSCPY |
202 | shrq $2, %rdx |
203 | # endif |
204 | |
205 | VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) |
206 | VPTESTN %VMM(1), %VMM(1), %k0 |
207 | KMOV %k0, %VRCX |
208 | |
209 | /* -1 because of the `dec %rdx` earlier. */ |
210 | cmpq $(CHAR_PER_VEC * 2 - 1), %rdx |
211 | ja L(more_2x_vec) |
212 | |
213 | L(last_2x_vec): |
/* This will need to be computed no matter what. We do it
ahead of time for CHAR_PER_VEC == 64 because we can't adjust
the value of `tzcnt` with a shift. */
217 | # if CHAR_PER_VEC == 64 |
218 | tzcntq %rcx, %rcx |
219 | # endif |
220 | |
221 | cmpl $(CHAR_PER_VEC), %edx |
222 | jb L(ret_vec_x1_len) |
223 | |
/* Separate logic for CHAR_PER_VEC == 64 because we already did
`tzcnt` on VRCX. */
226 | # if CHAR_PER_VEC == 64 |
227 | /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */ |
228 | cmpb $CHAR_PER_VEC, %cl |
229 | jnz L(ret_vec_x1_no_bsf) |
230 | # else |
231 | test %VRCX, %VRCX |
232 | jnz L(ret_vec_x1) |
233 | # endif |
234 | |
235 | |
236 | |
237 | VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 |
238 | VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) |
239 | KMOV %k0, %VRCX |
240 | |
241 | # if CHAR_PER_VEC < 64 |
/* This essentially adds CHAR_PER_VEC to the computed result. */
243 | shlq $CHAR_PER_VEC, %rcx |
244 | # else |
245 | tzcntq %rcx, %rcx |
246 | addl $CHAR_PER_VEC, %ecx |
247 | # endif |
248 | |
249 | .p2align 4,, 4 |
250 | L(ret_vec_x1_len): |
251 | /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has |
252 | already been done. */ |
253 | # if CHAR_PER_VEC < 64 |
254 | tzcntq %rcx, %rcx |
255 | # endif |
256 | cmpl %ecx, %edx |
257 | jbe L(ret_vec_x1_len_no_zfill) |
/* The expected fall-through case is copy length < buffer length. */
259 | VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
260 | L(ret_vec_x1_len_no_zfill_mov): |
261 | movl %ecx, %edx |
262 | # ifdef USE_AS_STPCPY |
/* Clear the carry flag for the `adc` below. */
264 | xorl %ecx, %ecx |
265 | # endif |
266 | L(ret_vec_x1_len_no_zfill): |
267 | VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) |
268 | VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
269 | # ifdef USE_AS_STPCPY |
270 | # ifdef USE_AS_WCSCPY |
271 | adcq $0, %rdx |
272 | leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax |
273 | # else |
274 | leal (VEC_SIZE)(%rdx), %eax |
275 | adcq %rdi, %rax |
276 | # endif |
277 | # endif |
278 | ret |
279 | |
280 | |
281 | .p2align 4,, 10 |
282 | L(ret_vec_x1): |
283 | bsf %VRCX, %VRCX |
284 | L(ret_vec_x1_no_bsf): |
285 | VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
286 | subl %ecx, %edx |
287 | cmpl $CHAR_PER_VEC, %edx |
288 | jb L(ret_vec_x1_len_no_zfill_mov) |
/* The expected fall-through case is copy length < buffer length. */
290 | VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) |
291 | VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) |
292 | # ifdef USE_AS_STPCPY |
293 | leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax |
294 | # endif |
295 | ret |
296 | |
297 | .p2align 4,, 8 |
298 | L(last_4x_vec): |
/* Separate logic for CHAR_PER_VEC == 64 because we can do the
`andl $(CHAR_PER_VEC * 4 - 1), %edx` with less code size by
just using `movzbl`. */
302 | # if CHAR_PER_VEC == 64 |
303 | movzbl %dl, %edx |
304 | # else |
305 | andl $(CHAR_PER_VEC * 4 - 1), %edx |
306 | # endif |
307 | VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) |
308 | VPTESTN %VMM(1), %VMM(1), %k0 |
309 | KMOV %k0, %VRCX |
310 | subq $-(VEC_SIZE * 4), %rsi |
311 | subq $-(VEC_SIZE * 4), %rdi |
312 | cmpl $(CHAR_PER_VEC * 2 - 1), %edx |
313 | jbe L(last_2x_vec) |
314 | .p2align 4,, 8 |
315 | L(more_2x_vec): |
316 | VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) |
317 | test %VRCX, %VRCX |
318 | /* Must fill at least 2x VEC. */ |
319 | jnz L(zfill_vec1) |
320 | |
321 | VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) |
322 | VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) |
323 | VPTESTN %VMM(2), %VMM(2), %k0 |
324 | KMOV %k0, %VRCX |
325 | test %VRCX, %VRCX |
326 | /* Must fill at least 1x VEC. */ |
327 | jnz L(zfill_vec2) |
328 | |
329 | VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) |
330 | VPTESTN %VMM(3), %VMM(3), %k0 |
331 | KMOV %k0, %VRCX |
332 | |
/* Check if len is more than 4x VEC. -1 because rdx is len - 1. */
334 | cmpq $(CHAR_PER_VEC * 4 - 1), %rdx |
335 | ja L(more_4x_vec) |
336 | |
337 | subl $(CHAR_PER_VEC * 3), %edx |
338 | jb L(ret_vec_x3_len) |
339 | |
340 | test %VRCX, %VRCX |
341 | jnz L(ret_vec_x3) |
342 | |
343 | VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 |
344 | VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) |
345 | KMOV %k0, %VRCX |
346 | tzcnt %VRCX, %VRCX |
347 | cmpl %ecx, %edx |
348 | jbe L(ret_vec_x4_len_no_zfill) |
/* The expected fall-through case is copy length < buffer length. */
350 | VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
351 | movl %ecx, %edx |
352 | L(ret_vec_x4_len_no_zfill): |
353 | VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) |
354 | VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
355 | # ifdef USE_AS_STPCPY |
356 | # ifdef USE_AS_WCSCPY |
357 | adcq $0, %rdx |
358 | leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax |
359 | # else |
360 | leal (VEC_SIZE * 4 + 0)(%rdx), %eax |
361 | adcq %rdi, %rax |
362 | # endif |
363 | # endif |
364 | ret |
365 | |
366 | |
367 | L(ret_vec_x3_len): |
368 | addl $(CHAR_PER_VEC * 1), %edx |
369 | tzcnt %VRCX, %VRCX |
370 | cmpl %ecx, %edx |
371 | jbe L(ret_vec_x3_len_no_zfill) |
/* The expected fall-through case is copy length < buffer length. */
373 | VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
374 | L(ret_vec_x3_len_no_zfill_mov): |
375 | movl %ecx, %edx |
376 | # ifdef USE_AS_STPCPY |
/* Clear the carry flag for the `adc` below. */
378 | xorl %ecx, %ecx |
379 | # endif |
380 | .p2align 4,, 4 |
381 | L(ret_vec_x3_len_no_zfill): |
382 | VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) |
383 | VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) |
384 | # ifdef USE_AS_STPCPY |
385 | # ifdef USE_AS_WCSCPY |
386 | adcq $0, %rdx |
387 | leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax |
388 | # else |
389 | leal (VEC_SIZE * 3 + 0)(%rdx), %eax |
390 | adcq %rdi, %rax |
391 | # endif |
392 | # endif |
393 | ret |
394 | |
395 | |
396 | .p2align 4,, 8 |
397 | L(ret_vec_x3): |
398 | bsf %VRCX, %VRCX |
399 | VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) |
400 | subl %ecx, %edx |
401 | jl L(ret_vec_x3_len_no_zfill_mov) |
402 | VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) |
403 | VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) |
404 | # ifdef USE_AS_STPCPY |
405 | leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax |
406 | # endif |
407 | ret |
408 | |
409 | .p2align 4,, 8 |
410 | L(more_4x_vec): |
411 | VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) |
412 | test %VRCX, %VRCX |
413 | jnz L(zfill_vec3) |
414 | |
415 | VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) |
416 | VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) |
417 | VPTESTN %VMM(4), %VMM(4), %k0 |
418 | KMOV %k0, %VRCX |
419 | test %VRCX, %VRCX |
420 | jnz L(zfill_vec4) |
421 | |
422 | /* Recheck length before aligning. */ |
423 | cmpq $(CHAR_PER_VEC * 8 - 1), %rdx |
424 | jbe L(last_4x_vec) |
425 | |
426 | /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */ |
427 | # ifdef USE_AS_WCSCPY |
428 | leaq (%rsi, %rdx, CHAR_SIZE), %rdx |
429 | # else |
430 | addq %rsi, %rdx |
431 | # endif |
432 | subq %rsi, %rdi |
433 | subq $-(VEC_SIZE * 5), %rsi |
434 | andq $(VEC_SIZE * -4), %rsi |
435 | |
436 | |
437 | /* Load first half of the loop before entry. */ |
438 | VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) |
439 | VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) |
440 | VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) |
441 | VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) |
442 | |
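/* A zero lane in VPMIN (a, b) means either a or b has a zero
CHAR in that lane, so one VPTESTN per pair checks two vectors
at once. */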
443 | VPMIN %VMM(0), %VMM(1), %VMM(4) |
444 | VPMIN %VMM(2), %VMM(3), %VMM(6) |
445 | VPTESTN %VMM(4), %VMM(4), %k2 |
446 | VPTESTN %VMM(6), %VMM(6), %k4 |
447 | |
448 | |
449 | /* Offset rsi by VEC_SIZE so that we can jump to |
450 | L(loop_last_4x_vec). */ |
451 | addq $-(VEC_SIZE), %rsi |
452 | KORTEST %k2, %k4 |
453 | jnz L(loop_4x_done) |
454 | |
455 | /* Store loop end in r9. */ |
456 | leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 |
457 | |
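/* Main loop: store the previously loaded 4x VEC, advance, then
load and test the next 4x VEC for a zero CHAR. Exit when a
zero CHAR is found or fewer than 4x VEC of length remain. */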
458 | .p2align 4,, 11 |
459 | L(loop_4x_vec): |
460 | VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) |
461 | VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) |
462 | VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) |
463 | VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) |
464 | |
465 | subq $(VEC_SIZE * -4), %rsi |
466 | cmpq %rsi, %r9 |
467 | jbe L(loop_last_4x_vec) |
468 | |
469 | VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) |
470 | VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) |
471 | VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) |
472 | VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) |
473 | |
474 | VPMIN %VMM(0), %VMM(1), %VMM(4) |
475 | VPMIN %VMM(2), %VMM(3), %VMM(6) |
476 | VPTESTN %VMM(4), %VMM(4), %k2 |
477 | VPTESTN %VMM(6), %VMM(6), %k4 |
478 | KORTEST %k2, %k4 |
479 | jz L(loop_4x_vec) |
480 | |
481 | L(loop_4x_done): |
482 | /* Restore rdx (length). */ |
483 | subq %rsi, %rdx |
484 | # ifdef USE_AS_WCSCPY |
485 | shrq $2, %rdx |
486 | # endif |
487 | VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) |
488 | /* Restore rdi (dst). */ |
489 | addq %rsi, %rdi |
490 | VPTESTN %VMM(0), %VMM(0), %k0 |
491 | KMOV %k0, %VRCX |
492 | test %VRCX, %VRCX |
493 | jnz L(zfill_vec1) |
494 | |
495 | VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) |
496 | KMOV %k2, %VRCX |
497 | test %VRCX, %VRCX |
498 | jnz L(zfill_vec2) |
499 | |
500 | VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) |
501 | VPTESTN %VMM(2), %VMM(2), %k0 |
502 | KMOV %k0, %VRCX |
503 | test %VRCX, %VRCX |
504 | jnz L(zfill_vec3) |
505 | |
506 | VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) |
507 | KMOV %k4, %VRCX |
/* VMM(3) must have contained a zero CHAR: fall through to
zero-fill the rest of the buffer. */
509 | |
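/* The L(zfill_vecN) entry points adjust rdi/rdx for the vector
in which the zero CHAR was found, then fall through to the
common zero-fill code. */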
510 | .p2align 4,, 4 |
511 | L(zfill_vec4): |
512 | subq $(VEC_SIZE * -2), %rdi |
513 | addq $(CHAR_PER_VEC * -2), %rdx |
514 | L(zfill_vec2): |
515 | subq $(VEC_SIZE * -2), %rdi |
516 | addq $(CHAR_PER_VEC * -1), %rdx |
517 | L(zfill): |
518 | /* VRCX must be non-zero. */ |
519 | bsf %VRCX, %VRCX |
520 | |
521 | /* Adjust length / dst for zfill. */ |
522 | subq %rcx, %rdx |
523 | # ifdef USE_AS_WCSCPY |
524 | leaq (%rdi, %rcx, CHAR_SIZE), %rdi |
525 | # else |
526 | addq %rcx, %rdi |
527 | # endif |
528 | # ifdef USE_AS_STPCPY |
529 | movq %rdi, %rax |
530 | # endif |
531 | L(zfill_from_page_cross): |
532 | |
/* From here on out it's just memset(rdi, 0, rdx). */
534 | cmpq $CHAR_PER_VEC, %rdx |
535 | jb L(zfill_less_vec) |
536 | |
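/* Zero-fill with potentially overlapping stores: one VEC at the
start and one ending at the last CHAR, widening to 2x/4x VEC
and an aligned 4x VEC loop for larger remainders. */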
537 | L(zfill_more_1x_vec): |
538 | VMOVU %VZERO, (%rdi) |
539 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) |
540 | cmpq $(CHAR_PER_VEC * 2 - 1), %rdx |
541 | ja L(zfill_more_2x_vec) |
542 | L(zfill_done0): |
543 | ret |
544 | |
545 | /* Coming from vec1/vec2 we must be able to zfill at least 2x |
546 | VEC. */ |
547 | .p2align 4,, 8 |
548 | L(zfill_vec3): |
549 | subq $(VEC_SIZE * -2), %rdi |
550 | addq $(CHAR_PER_VEC * -2), %rdx |
551 | .p2align 4,, 2 |
552 | L(zfill_vec1): |
553 | bsfq %rcx, %rcx |
554 | /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. |
555 | */ |
556 | leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi |
557 | subq %rcx, %rdx |
558 | # ifdef USE_AS_STPCPY |
559 | movq %rdi, %rax |
560 | # endif |
561 | |
562 | |
563 | VMOVU %VZERO, (%rdi) |
564 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) |
565 | cmpq $(CHAR_PER_VEC * 2), %rdx |
566 | jb L(zfill_done0) |
567 | L(zfill_more_2x_vec): |
568 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) |
569 | VMOVU %VZERO, (VEC_SIZE)(%rdi) |
570 | subq $(CHAR_PER_VEC * 4 - 1), %rdx |
571 | jbe L(zfill_done) |
572 | |
573 | # ifdef USE_AS_WCSCPY |
574 | leaq (%rdi, %rdx, CHAR_SIZE), %rdx |
575 | # else |
576 | addq %rdi, %rdx |
577 | # endif |
578 | |
579 | VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) |
580 | VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) |
581 | |
582 | |
583 | VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) |
584 | VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) |
585 | |
586 | subq $-(VEC_SIZE * 4), %rdi |
587 | cmpq %rdi, %rdx |
588 | jbe L(zfill_done) |
589 | |
590 | /* Align rdi and zfill loop. */ |
591 | andq $-(VEC_SIZE), %rdi |
592 | .p2align 4,, 12 |
593 | L(zfill_loop_4x_vec): |
594 | VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) |
595 | VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) |
596 | VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) |
597 | VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) |
598 | subq $-(VEC_SIZE * 4), %rdi |
599 | cmpq %rdi, %rdx |
600 | ja L(zfill_loop_4x_vec) |
601 | L(zfill_done): |
602 | ret |
603 | |
604 | |
/* Less than 1x VEC case if we are not using evex masked stores. */
606 | # if !USE_EVEX_MASKED_STORE |
607 | .p2align 4,, 8 |
608 | L(copy_1x): |
609 | /* Special case for copy 1x. It can be handled quickly and many |
610 | buffer sizes have convenient alignment. */ |
611 | VMOVU %VMM(0), (%rdi) |
612 | /* If no zeros then we are done. */ |
613 | testl %ecx, %ecx |
614 | jz L(ret_1x_1x) |
615 | |
/* Need to zfill. Note we know that length <= CHAR_PER_VEC so we
only handle the small case here. */
618 | bsf %VRCX, %VRCX |
619 | L(zfill_less_vec_no_bsf): |
620 | /* Adjust length / dst then just zfill less_vec. */ |
621 | subq %rcx, %rdx |
622 | # ifdef USE_AS_WCSCPY |
623 | leaq (%rdi, %rcx, CHAR_SIZE), %rdi |
624 | # else |
625 | addq %rcx, %rdi |
626 | # endif |
627 | # ifdef USE_AS_STPCPY |
628 | movq %rdi, %rax |
629 | # endif |
630 | |
631 | L(zfill_less_vec): |
632 | cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx |
633 | jb L(zfill_less_half) |
634 | |
635 | VMOVU %VZERO_HALF, (%rdi) |
636 | VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
637 | ret |
638 | # ifdef USE_AS_STPCPY |
639 | L(ret_1x_1x): |
640 | leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax |
641 | ret |
642 | # endif |
643 | |
644 | |
645 | # if VEC_SIZE == 64 |
646 | .p2align 4,, 4 |
647 | L(copy_32_63): |
648 | /* Overfill to avoid branches. */ |
649 | VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) |
650 | VMOVU %VMM_256(0), (%rdi) |
651 | VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
652 | |
/* We are taking advantage of the fact that to be here we must
be writing the null terminator at (%rdi, %rcx), so we have a
byte of leeway for overwriting. */
656 | cmpl %ecx, %edx |
657 | ja L(zfill_less_vec_no_bsf) |
658 | # ifndef USE_AS_STPCPY |
659 | L(ret_1x_1x): |
660 | # else |
661 | # ifdef USE_AS_WCSCPY |
662 | adcq $0, %rdx |
663 | leaq (%rdi, %rdx, CHAR_SIZE), %rax |
664 | # else |
665 | movl %edx, %eax |
666 | adcq %rdi, %rax |
667 | # endif |
668 | # endif |
669 | ret |
670 | # endif |
671 | |
672 | .p2align 4,, 4 |
673 | L(copy_16_31): |
674 | /* Overfill to avoid branches. */ |
675 | vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 |
676 | VMOVU %VMM_128(0), (%rdi) |
677 | vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
678 | cmpl %ecx, %edx |
679 | |
/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
we have a larger copy block for 32-63 so this just falls
through to zfill 16-31. If VEC_SIZE == 32 then we check for
full zfill of less than 1x VEC. */
684 | # if VEC_SIZE == 64 |
685 | jbe L(ret_16_31) |
686 | subl %ecx, %edx |
687 | # ifdef USE_AS_WCSCPY |
688 | leaq (%rdi, %rcx, CHAR_SIZE), %rdi |
689 | # else |
690 | addq %rcx, %rdi |
691 | # endif |
692 | # ifdef USE_AS_STPCPY |
693 | movq %rdi, %rax |
694 | # endif |
695 | L(zfill_less_half): |
696 | L(zfill_less_32): |
697 | cmpl $(16 / CHAR_SIZE), %edx |
698 | jb L(zfill_less_16) |
699 | VMOVU %VZERO_128, (%rdi) |
700 | VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
701 | # ifdef USE_AS_STPCPY |
702 | ret |
703 | # endif |
704 | L(ret_16_31): |
705 | # ifdef USE_AS_STPCPY |
706 | # ifdef USE_AS_WCSCPY |
707 | adcq $0, %rdx |
708 | leaq (%rdi, %rdx, CHAR_SIZE), %rax |
709 | # else |
710 | movl %edx, %eax |
711 | adcq %rdi, %rax |
712 | # endif |
713 | # endif |
714 | ret |
715 | # else |
716 | /* VEC_SIZE == 32 begins. */ |
717 | ja L(zfill_less_vec_no_bsf) |
718 | # ifndef USE_AS_STPCPY |
719 | L(ret_1x_1x): |
720 | # else |
721 | # ifdef USE_AS_WCSCPY |
722 | adcq $0, %rdx |
723 | leaq (%rdi, %rdx, CHAR_SIZE), %rax |
724 | # else |
725 | movl %edx, %eax |
726 | adcq %rdi, %rax |
727 | # endif |
728 | # endif |
729 | ret |
730 | # endif |
731 | |
732 | |
733 | .p2align 4,, 4 |
734 | L(copy_8_15): |
735 | /* Overfill to avoid branches. */ |
736 | movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi |
737 | vmovq %VMM_128(0), (%rdi) |
738 | movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
739 | cmpl %ecx, %edx |
740 | jbe L(ret_8_15) |
741 | subl %ecx, %edx |
742 | # ifdef USE_AS_WCSCPY |
743 | leaq (%rdi, %rcx, CHAR_SIZE), %rdi |
744 | # else |
745 | addq %rcx, %rdi |
746 | # endif |
747 | # ifdef USE_AS_STPCPY |
748 | movq %rdi, %rax |
749 | # endif |
750 | .p2align 4,, 8 |
751 | # if VEC_SIZE == 32 |
752 | L(zfill_less_half): |
753 | # endif |
754 | L(zfill_less_16): |
755 | xorl %ecx, %ecx |
756 | cmpl $(8 / CHAR_SIZE), %edx |
757 | jb L(zfill_less_8) |
758 | movq %rcx, (%rdi) |
759 | movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) |
760 | # ifndef USE_AS_STPCPY |
761 | L(ret_8_15): |
762 | # endif |
763 | ret |
764 | |
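/* Copies of at most 1x VEC dispatch on length to special cases
that use overlapping loads/stores to avoid branching on the
exact size. */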
765 | .p2align 4,, 8 |
766 | L(less_1x_vec): |
767 | je L(copy_1x) |
768 | |
/* We will need the `tzcnt` result for all other copy sizes. */
770 | tzcnt %VRCX, %VRCX |
771 | # if VEC_SIZE == 64 |
772 | cmpl $(32 / CHAR_SIZE), %edx |
773 | jae L(copy_32_63) |
774 | # endif |
775 | |
776 | cmpl $(16 / CHAR_SIZE), %edx |
777 | jae L(copy_16_31) |
778 | |
779 | cmpl $(8 / CHAR_SIZE), %edx |
780 | jae L(copy_8_15) |
781 | # ifdef USE_AS_WCSCPY |
782 | testl %ecx, %ecx |
783 | jz L(zfill_less_8_set_ret) |
784 | |
785 | movl (%rsi, %rdx, CHAR_SIZE), %esi |
786 | vmovd %VMM_128(0), (%rdi) |
787 | movl %esi, (%rdi, %rdx, CHAR_SIZE) |
788 | # ifdef USE_AS_STPCPY |
789 | cmpl %ecx, %edx |
790 | L(ret_8_15): |
791 | adcq $0, %rdx |
792 | leaq (%rdi, %rdx, CHAR_SIZE), %rax |
793 | # endif |
794 | ret |
795 | L(zfill_less_8_set_ret): |
796 | xorl %ecx, %ecx |
797 | # ifdef USE_AS_STPCPY |
798 | movq %rdi, %rax |
799 | # endif |
800 | L(zfill_less_8): |
801 | movl %ecx, (%rdi) |
802 | movl %ecx, (%rdi, %rdx, CHAR_SIZE) |
803 | ret |
804 | # else |
805 | cmpl $3, %edx |
806 | jb L(copy_0_3) |
807 | /* Overfill to avoid branches. */ |
808 | movl -3(%rsi, %rdx), %esi |
809 | vmovd %VMM_128(0), (%rdi) |
810 | movl %esi, -3(%rdi, %rdx) |
811 | cmpl %ecx, %edx |
812 | jbe L(ret_4_7) |
813 | subq %rcx, %rdx |
814 | addq %rcx, %rdi |
815 | # ifdef USE_AS_STPCPY |
816 | movq %rdi, %rax |
817 | # endif |
818 | xorl %ecx, %ecx |
819 | .p2align 4,, 8 |
820 | L(zfill_less_8): |
821 | cmpl $3, %edx |
822 | jb L(zfill_less_3) |
823 | movl %ecx, (%rdi) |
824 | movl %ecx, -3(%rdi, %rdx) |
825 | # ifdef USE_AS_STPCPY |
826 | ret |
827 | # endif |
828 | |
829 | L(ret_4_7): |
830 | # ifdef USE_AS_STPCPY |
831 | L(ret_8_15): |
832 | movl %edx, %eax |
833 | adcq %rdi, %rax |
834 | # endif |
835 | ret |
836 | |
837 | .p2align 4,, 4 |
838 | L(zfill_less_3): |
839 | testl %edx, %edx |
840 | jz L(zfill_1) |
841 | movw %cx, (%rdi) |
842 | L(zfill_1): |
843 | movb %cl, (%rdi, %rdx) |
844 | ret |
845 | |
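/* Byte-string copies of length 1 to 3: extract the low bytes of
VMM(0) into r8 and store them piecewise, zero-filling if the
string ends before the buffer does. */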
846 | .p2align 4,, 8 |
847 | L(copy_0_3): |
848 | vmovd %VMM_128(0), %r8d |
849 | testl %edx, %edx |
850 | jz L(copy_1) |
851 | movw %r8w, (%rdi) |
852 | cmpl %ecx, %edx |
853 | ja L(zfill_from_1) |
854 | movzbl (%rsi, %rdx), %r8d |
855 | # ifdef USE_AS_STPCPY |
856 | movl %edx, %eax |
857 | adcq %rdi, %rax |
858 | movb %r8b, (%rdi, %rdx) |
859 | ret |
860 | # endif |
861 | |
862 | L(copy_1): |
863 | # ifdef USE_AS_STPCPY |
864 | movl %edx, %eax |
865 | cmpl %ecx, %edx |
866 | adcq %rdi, %rax |
867 | # endif |
868 | # ifdef USE_AS_WCSCPY |
869 | vmovd %VMM_128(0), (%rdi) |
870 | # else |
871 | movb %r8b, (%rdi, %rdx) |
872 | # endif |
873 | ret |
874 | # endif |
875 | |
876 | |
877 | # ifndef USE_AS_WCSCPY |
878 | .p2align 4,, 8 |
879 | L(zfill_from_1): |
880 | # ifdef USE_AS_STPCPY |
881 | leaq (%rdi, %rcx), %rax |
882 | # endif |
883 | movw $0, -1(%rdi, %rdx) |
884 | ret |
885 | # endif |
886 | |
887 | .p2align 4,, 4 |
888 | L(zero_len): |
889 | incq %rdx |
890 | jne L(best_effort_strncpy) |
891 | movq %rdi, %rax |
892 | ret |
893 | # endif |
894 | |
895 | |
896 | .p2align 4,, 4 |
897 | .p2align 6,, 8 |
898 | L(page_cross): |
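/* The first load would cross a page. Only check up to the page
boundary: align rsi down to VEC_SIZE, test that aligned VEC
for zero CHARs, and shift the mask so bit 0 corresponds to the
CHAR at rsi. */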
899 | movq %rsi, %rax |
900 | andq $(VEC_SIZE * -1), %rax |
901 | VPCMPEQ (%rax), %VZERO, %k0 |
902 | KMOV %k0, %VRCX |
903 | # ifdef USE_AS_WCSCPY |
904 | movl %esi, %r8d |
905 | shrl $2, %r8d |
906 | andl $(CHAR_PER_VEC - 1), %r8d |
907 | shrx %VR8, %VRCX, %VRCX |
908 | # else |
909 | shrx %VRSI, %VRCX, %VRCX |
910 | # endif |
911 | |
/* Compute the number of bytes we checked. */
913 | subl %esi, %eax |
914 | andl $(VEC_SIZE - 1), %eax |
915 | # ifdef USE_AS_WCSCPY |
916 | shrl $2, %eax |
917 | # endif |
918 | |
/* If rax > rdx then we finish the copy before reaching the end
of the page. */
921 | cmpq %rax, %rdx |
922 | jb L(page_cross_small) |
923 | |
924 | |
/* If rcx is non-zero (a zero CHAR was found) handle it here;
otherwise resume the main path. */
926 | test %VRCX, %VRCX |
927 | jz L(page_cross_continue) |
928 | |
/* We found a zero CHAR so we need to copy then zfill (we know
we didn't cover the whole length here). */
931 | bsf %VRCX, %VRCX |
932 | L(movsb_and_zfill): |
933 | incl %ecx |
934 | subq %rcx, %rdx |
935 | # ifdef USE_AS_STPCPY |
936 | leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax |
937 | # else |
938 | movq %rdi, %rax |
939 | # endif |
940 | |
941 | REP_MOVS |
942 | # ifdef USE_AS_WCSCPY |
943 | movl $0, (%rdi) |
944 | # else |
945 | movb $0, (%rdi) |
946 | # endif |
947 | jmp L(zfill_from_page_cross) |
948 | |
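/* The whole copy fits in the region checked before the page
boundary. If the string ends before the buffer does, zero-fill
the tail with rep stos first, then do the copy with rep movs. */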
949 | L(page_cross_small): |
950 | tzcnt %VRCX, %VRCX |
951 | cmpl %ecx, %edx |
952 | jbe L(page_cross_copy_only) |
953 | |
954 | /* Do a zfill of the tail before copying. */ |
955 | movq %rdi, %r9 |
956 | xorl %eax, %eax |
957 | |
958 | movl %ecx, %r8d |
959 | |
960 | subl %ecx, %edx |
961 | leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi |
962 | movl %edx, %ecx |
963 | REP_STOS |
964 | movq %r9, %rdi |
965 | movl %r8d, %edx |
966 | L(page_cross_copy_only): |
967 | leal 1(%rdx), %ecx |
968 | # ifdef USE_AS_STPCPY |
969 | # ifdef USE_AS_WCSCPY |
970 | adcl $0, %edx |
971 | leaq (%rdi, %rdx, CHAR_SIZE), %rax |
972 | # else |
973 | movl %edx, %eax |
974 | adcq %rdi, %rax |
975 | # endif |
976 | # else |
977 | movq %rdi, %rax |
978 | # endif |
979 | REP_MOVS |
980 | ret |
981 | |
982 | |
983 | L(best_effort_strncpy): |
984 | movq %rdx, %rcx |
985 | xorl %eax, %eax |
986 | movq %rdi, %r8 |
/* The length is >= 2^63. We very much expect rep stos to
segfault. If that doesn't happen then just strcpy to finish.
*/
990 | REP_STOS |
991 | movq %r8, %rdi |
992 | jmp OVERFLOW_STRCPY |
993 | END(STRNCPY) |
994 | #endif |
995 | |