/* {wcs|str}ncat with 256/512-bit EVEX.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

        /* Use evex-masked stores for small sizes.  Turned off at the
           moment.  */
# define USE_EVEX_MASKED_STORE 0

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT __strncat_evex
# endif


# ifdef USE_AS_WCSCPY
#  define MOVCHAR movl
#  define VMOVU_MASK vmovdqu32
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_SIZE 4

#  define REP_MOVS rep movsd

#  define VMASK_REG VR10
#  define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst

#  define USE_WIDE_CHAR
# else
#  define MOVCHAR movb
#  define VMOVU_MASK vmovdqu8
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_SIZE 1

#  define REP_MOVS rep movsb

#  define VMASK_REG VRCX
#  define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst

# endif

# include "strncpy-or-cat-overflow-def.h"

# include "reg-macros.h"


# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
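
        /* In outline: %rdi = dst, %rsi = src and %rdx = the maximum
           number of CHARs to append; %rax keeps the original dst for
           the return value.  strcat-strlen-evex.h.S first advances
           %rdi to the end of dst (its null terminator); from there
           this is essentially a length-limited strcpy: whole VECs are
           copied until either a zero CHAR or the limit is reached,
           the tail is finished with a (possibly overlapping) final
           copy, and the terminating null CHAR is stored explicitly.  */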

        .section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif

        movq %rdi, %rax

        /* NB: It's safe to filter out zero-length strings WITHOUT
           setting null-term.  Destination MUST be a null-terminated
           string so essentially the work is already done.  */
# ifdef USE_AS_WCSCPY
        leaq -1(%rdx), %rcx
        shrq $56, %rcx
        jnz L(zero_len)
# else
        test %rdx, %rdx
        jle L(zero_len)
# endif
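
        /* The wcsncat form of the check above is equivalent to this C
           sketch (using `len` for the value in %rdx): reject a limit
           that is zero or absurdly large (more than 2^56 CHARs) so
           that the later address arithmetic rsi + rdx * CHAR_SIZE
           cannot overflow:

               if (((len - 1) >> 56) != 0)
                   goto zero_len;        -- len == 0 wraps around to ~0

           The strncat form instead treats a signed len <= 0 the same
           way; the flags set by that `test` are reused at
           L(zero_len).  */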

# include "strcat-strlen-evex.h.S"

        movl %esi, %ecx
        andl $(PAGE_SIZE - 1), %ecx
        cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
        ja L(page_cross)
L(page_cross_continue):
        VMOVU (%rsi), %VMM(0)
        VPTESTN %VMM(0), %VMM(0), %k0

        /* If USE_EVEX_MASKED_STORE is enabled then we just handle
           length <= CHAR_PER_VEC with masked instructions (which have
           potential for dramatically bad perf if dst splits a page
           and is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
        KMOV %k0, %VRCX
        FIND_FIRST_ONE (VRCX, VR8)
        cmpq %r8, %rdx
        jbe L(less_1x_vec)

        test %VRCX, %VRCX
        jz L(more_1x_vec)

        blsmsk %VRCX, %VRCX
        KMOV %VRCX, %k1
        VMOVU_MASK %VMM(0), (%rdi){%k1}
        ret

L(less_1x_vec):
        mov $-1, %VRCX
        bzhi %VRDX, %VRCX, %VRCX
        KMOV %VRCX, %k1
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        VMOVU_MASK %VMM(0), (%rdi){%k1}

        ret
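
        /* Both masked stores above write only the live CHARs: blsmsk
           turns the zero-CHAR mask into a mask covering everything up
           to and including the first null, while `mov $-1` + bzhi
           keeps just the low rdx bits for the length-limited case
           (whose null is stored separately).  Roughly:
           mask = (1 << len) - 1, with bzhi also handling len equal to
           the full register width.  */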
# else
        KMOV %k0, %VMASK_REG
        /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
           %VMASK_REG, %VRCX` for wcsncat.  */
        FIND_FIRST_ONE (VMASK_REG, VRCX)
        cmpq %rcx, %rdx
        jbe L(less_1x_vec)

        /* If there were no zero-CHARs (rcx was zero before
           FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
        cmpl $CHAR_PER_VEC, %ecx
        je L(more_1x_vec)

        movl %ecx, %edx

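        /* Here %rdx is the exact number of CHARs to copy from src,
           not counting the terminating null stored explicitly below:
           either the caller's limit or the distance to src's first
           zero CHAR, whichever is smaller.  */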
L(less_1x_vec):
#  if VEC_SIZE == 64
        cmpl $(32 / CHAR_SIZE), %edx
        jae L(copy_32_63)
#  endif

        cmpl $(16 / CHAR_SIZE), %edx
        jae L(copy_16_31)


        cmpl $(8 / CHAR_SIZE), %edx
        jae L(copy_8_15)

#  ifdef USE_AS_WCSCPY
        vmovd %VMM_128(0), (%rdi)
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        ret
#  else

        cmpl $4, %edx
        jae L(copy_4_7)

        movzbl (%rsi), %ecx
        cmpl $1, %edx
        jbe L(set_null_term)

        movzwl 1(%rsi), %esi
        movw %si, 1(%rdi)

        .p2align 4,, 1
L(set_null_term):
        movb %cl, (%rdi)
        MOVCHAR $0, (%rdi, %rdx)
        ret
#  endif

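        /* The fixed-size paths below share one idea: the head of src
           is already in VMM(0), and a second load is taken so that it
           ends exactly at the last CHAR to be copied.  The two stores
           may overlap, which is harmless, and together they cover any
           length in the advertised range; the null is stored
           separately.  */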
#  if VEC_SIZE == 64
        .p2align 4,, 6
L(copy_32_63):
        VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
        VMOVU %VMM_256(0), (%rdi)
        VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        ret
#  endif
        .p2align 4,, 6
L(copy_16_31):
        /* Use xmm1 explicitly here as it won't require a `vzeroupper`
           and will save code size.  */
        vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
        VMOVU %VMM_128(0), (%rdi)
        vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        ret

        .p2align 4,, 2
L(copy_8_15):
        movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
        vmovq %VMM_128(0), (%rdi)
        movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        ret

#  ifndef USE_AS_WCSCPY
        .p2align 4,, 12
L(copy_4_7):
        movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
        vmovd %VMM_128(0), (%rdi)
        movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
        MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
        ret
#  endif

# endif
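
        /* Zero / invalid length.  For strncat the flags from the
           `test %rdx, %rdx` at entry are still live here, so `jne`
           distinguishes a negative (i.e. huge unsigned) limit, which
           is forwarded to OVERFLOW_STRCAT (supplied by
           strncpy-or-cat-overflow-def.h), from a true zero, which
           simply returns since dst is already null-terminated.  For
           wcsncat the test is redone because the entry check used a
           shift instead.  */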
        .p2align 4,, 4
L(zero_len):
# ifdef USE_AS_WCSCPY
        test %rdx, %rdx
# endif
        jne OVERFLOW_STRCAT
        ret

        .p2align 4,, 8
L(more_1x_vec):
        VMOVU %VMM(0), (%rdi)

        /* We are going to align rsi here so will need to be able to
           re-adjust rdi/rdx afterwards.  NB: We filtered out huge
           lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */

        leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
        subq %rsi, %rdi
        andq $-(VEC_SIZE), %rsi
L(loop_last_4x_vec):
        addq %rsi, %rdi
        subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
        shrq $2, %rdx
# endif
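
        /* From here on %rsi is VEC_SIZE-aligned, %rdi has been biased
           by the same amount so that equal offsets from %rsi and %rdi
           refer to corresponding src/dst positions, and %rdx is the
           number of CHARs from the aligned %rsi up to one VEC_SIZE
           before the end of the length-limited source window.  */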

        /* Will need this regardless.  */
        VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
        VPTESTN %VMM(1), %VMM(1), %k0
        KMOV %k0, %VMASK_REG

        cmpq $(CHAR_PER_VEC * 2), %rdx
        ja L(more_2x_vec)

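        /* At most two source VECs remain.  The L(ret_vec_x*) paths
           below finish with one unaligned VEC copy that ends exactly
           at the last CHAR to be copied; its lower part overlaps
           bytes that were already copied, so rewriting them is
           harmless.  The null is stored separately.  */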
L(last_2x_vec):
        FIND_FIRST_ONE (VMASK_REG, VRCX)
        cmpl %ecx, %edx
        jbe L(ret_vec_x1_len)

        /* If there were no zero-CHARs (rcx was zero before
           FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
        cmpl $CHAR_PER_VEC, %ecx
        jne L(ret_vec_x1)

        VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
        VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
        VPTESTN %VMM(2), %VMM(2), %k0
        KMOV %k0, %VRCX
        addl $-CHAR_PER_VEC, %edx
        bzhi %VRDX, %VRCX, %VR8
        jz L(ret_vec_x2_len)
L(ret_vec_x2):
        bsf %VRCX, %VRDX
L(ret_vec_x2_len):
        VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
        MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
        VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
        ret

        .p2align 4,, 4
L(ret_vec_x1_len):
        movl %edx, %ecx
L(ret_vec_x1):
        VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
        MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
        VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
        VZEROUPPER_RETURN


        .p2align 4,, 8
L(last_4x_vec):
        addl $-(CHAR_PER_VEC * 4), %edx
        VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
        VPTESTN %VMM(1), %VMM(1), %k0
        KMOV %k0, %VMASK_REG
        subq $-(VEC_SIZE * 4), %rsi
        subq $-(VEC_SIZE * 4), %rdi
        cmpl $(CHAR_PER_VEC * 2), %edx
        jbe L(last_2x_vec)
        .p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
        xorl %ecx, %ecx
# endif
        bsf %VMASK_REG, %VRCX
        jnz L(ret_vec_x1)

        VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
        VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
        VPTESTN %VMM(2), %VMM(2), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x2)

        VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
        VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
        VPTESTN %VMM(3), %VMM(3), %k0
        KMOV %k0, %VMASK_REG

        cmpq $(CHAR_PER_VEC * 4), %rdx
        ja L(more_4x_vec)

        /* Adjust length before going to L(ret_vec_x3_len) or
           L(ret_vec_x3).  */
        addl $(CHAR_PER_VEC * -2), %edx

        FIND_FIRST_ONE (VMASK_REG, VRCX)
        cmpl %ecx, %edx
        jbe L(ret_vec_x3_len)

        /* If there were no zero-CHARs (rcx was zero before
           FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
        cmpl $CHAR_PER_VEC, %ecx
        jne L(ret_vec_x3)

        VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
        VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
        VPTESTN %VMM(4), %VMM(4), %k0
        KMOV %k0, %VRCX
        addl $-CHAR_PER_VEC, %edx
        bzhi %VRDX, %VRCX, %VR8
        jz L(ret_vec_x4_len)
L(ret_vec_x4):
        bsf %VRCX, %VRDX
L(ret_vec_x4_len):
        VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
        MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
        VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
        ret

        .p2align 4,, 4
L(ret_vec_x3_len):
        movl %edx, %ecx
L(ret_vec_x3):
        VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
        MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
        VMOVU %VMM(0), (VEC_SIZE * 3 -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
        ret

        .p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
        xorl %ecx, %ecx
# endif
        bsf %VMASK_REG, %VRCX
        jnz L(ret_vec_x3)

        VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
        VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
        VPTESTN %VMM(4), %VMM(4), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x4)

        VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)

        /* Check if we are near the end before aligning.  */
        cmpq $(CHAR_PER_VEC * 8), %rdx
        jbe L(last_4x_vec)


        /* Add rsi to rdx (length) before aligning rsi.  NB: Since we
           filtered out huge lengths this cannot overflow.  */
# ifdef USE_AS_WCSCPY
        leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
        addq %rsi, %rdx
# endif

        /* Subtract rsi from rdi before aligning (add back will have
           correct rdi for aligned rsi).  */
        subq %rsi, %rdi
        subq $-(VEC_SIZE * 5), %rsi
        andq $(VEC_SIZE * -4), %rsi
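
        /* %rdx now points one VEC_SIZE before the end of the
           length-limited source window (that offset was folded in
           back at L(more_1x_vec)), %rdi again holds dst - src, and
           %rsi has been advanced and rounded to a 4 * VEC_SIZE
           boundary.  The rounding never skips anything: the main
           loop's first store begins no later than the end of the data
           already copied, so any overlap just rewrites the same
           bytes.  */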

        /* Load first half of the loop before entry.  */
        VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
        VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
        VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
        VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

        VPMIN %VMM(0), %VMM(1), %VMM(4)
        VPMIN %VMM(2), %VMM(3), %VMM(6)
        VPTESTN %VMM(4), %VMM(4), %k2
        VPTESTN %VMM(6), %VMM(6), %k4

        /* Offset rsi by VEC_SIZE so that we can jump to
           L(loop_last_4x_vec).  */
        addq $-(VEC_SIZE), %rsi
        KORTEST %k2, %k4
        jnz L(loop_4x_done)

        /* Store loop end in r9.  */
        leaq -(VEC_SIZE * 5)(%rdx), %r9

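        /* Main 4x loop: the four VECs loaded and tested beforehand
           (just above, or by the previous iteration) are stored, then
           the next four are loaded and checked for a zero CHAR via
           VPMIN + VPTESTN (k2 covers VMM(0)/VMM(1), k4 covers
           VMM(2)/VMM(3)).  The loop keeps going only while another
           full 4-VEC round is known to stay inside the length-limited
           window (the comparison against %r9); otherwise it falls
           back to L(loop_last_4x_vec).  */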
        .p2align 4,, 11
L(loop_4x_vec):
        VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
        VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
        VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
        VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

        subq $(VEC_SIZE * -4), %rsi
        cmpq %rsi, %r9
        jbe L(loop_last_4x_vec)

        VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
        VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
        VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
        VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

        VPMIN %VMM(0), %VMM(1), %VMM(4)
        VPMIN %VMM(2), %VMM(3), %VMM(6)
        VPTESTN %VMM(4), %VMM(4), %k2
        VPTESTN %VMM(6), %VMM(6), %k4
        KORTEST %k2, %k4
        jz L(loop_4x_vec)

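        /* A zero CHAR was found in one of the last four VECs.  k2 is
           the combined zero-mask of VMM(0)/VMM(1) and k4 of
           VMM(2)/VMM(3), so once VMM(0) is ruled out a set bit in k2
           must belong to VMM(1), and k4 is only consulted after
           VMM(2) has been re-tested.  Each VEC is stored only once it
           is known not to contain the null.  */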
L(loop_4x_done):
        VPTESTN %VMM(0), %VMM(0), %k0
        KMOV %k0, %VRCX
        /* Restore rdi (dst).  */
        addq %rsi, %rdi

        /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
           test with bsf.  */
        bsf %VRCX, %VRCX
        jnz L(ret_vec_x1)
        VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)

        KMOV %k2, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x2)
        VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

        VPTESTN %VMM(2), %VMM(2), %k0
        KMOV %k0, %VRCX
        bsf %VRCX, %VRCX
        jnz L(ret_vec_x3)
        VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)

        KMOV %k4, %VRCX
        bsf %VRCX, %VRCX
        VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
        VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
        ret


        .p2align 4,, 4
L(page_cross):
        movq %rsi, %r8
        andq $(VEC_SIZE * -1), %r8
        VPCMPEQ (%r8), %VZERO, %k0

# ifdef USE_AS_WCSCPY
        KMOV %k0, %VR9
        shrl $2, %ecx
        andl $(CHAR_PER_VEC - 1), %ecx
        shrx %VRCX, %VR9, %VRCX
# else
        KMOV %k0, %VRCX
        shrx %VRSI, %VRCX, %VRCX
# endif
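
        /* %VRCX now has one bit per CHAR of the VEC_SIZE-aligned
           block containing %rsi, marking its zero CHARs, shifted so
           that bit 0 corresponds to the CHAR at %rsi itself.  */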

        subl %esi, %r8d
        andl $(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
        shrl $2, %r8d
# endif
        cmpq %r8, %rdx
        jbe L(page_cross_small)
        /* Optimizing more for space as this is very cold code.  This
           saves 2x cache lines.  */

        /* Shifting left by one adds one to the later bsf result,
           which gives the correct copy bound (the REP_MOVS count must
           include the null terminator).  NB: this can never zero-out
           a non-zero RCX: in the page-cross case rsi cannot be
           aligned, and rcx has already been right-shifted by that
           misalignment, so its top bit is clear.  */
        shl %VRCX
        jz L(page_cross_continue)
        bsf %VRCX, %VRCX
        REP_MOVS
        ret

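        /* Small page-cross case: the length limit fits before the
           next aligned block.  %rcx becomes the index of the first
           zero CHAR (or the register width if there is none), is
           clamped to the limit in %rdx, and REP MOVS copies that many
           CHARs; the null is then stored at the final %rdi.  */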
L(page_cross_small):
        tzcnt %VRCX, %VRCX
        jz L(page_cross_setz)
        cmpl %edx, %ecx
        cmova %edx, %ecx

# ifdef USE_AS_WCSCPY
        rep movsd
# else
        rep movsb
# endif
L(page_cross_setz):
        MOVCHAR $0, (%rdi)
        ret
END(STRNCAT)
#endif