/* strncat with AVX2
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT __strncat_avx2
# endif

# ifdef USE_AS_WCSCPY
#  define MOVCHAR movl
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define CHAR_SIZE 4
# else
#  define MOVCHAR movb
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define CHAR_SIZE 1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE 4096

# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)

        .section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        /* Filter zero length strings and very long strings.  Zero
           length strings just return; very long strings are handled
           by using the non-length variant {wcs|str}cat.  */
        movq %rdi, %rax
# ifdef USE_AS_WCSCPY
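        /* For wcsncat the length is in wchar_t units.  Lengths of
           zero or greater than 2^56 are filtered out here; the
           remaining length is then converted to bytes.  */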
        leaq -1(%rdx), %rcx
        shr $56, %rcx
        jnz L(zero_len)
        salq $2, %rdx
# else
        test %rdx, %rdx
        jle L(zero_len)
# endif
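        /* VZERO is the all-zeros vector compared against the string
           data throughout to locate null terminators.  */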
        vpxor %VZERO_128, %VZERO_128, %VZERO_128

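        /* Find the end of the destination string: afterwards %rdi
           points at dst's null terminator, where the append begins.  */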
# include "strcat-strlen-avx2.h.S"

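        /* Check whether an unaligned VEC-sized load from src would
           cross a page boundary (and so could fault).  */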
        movl %esi, %ecx
        andl $(PAGE_SIZE - 1), %ecx
        cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
        ja L(page_cross)
L(page_cross_continue):
        VMOVU (%rsi), %VMM(0)
        VPCMPEQ %VMM(0), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx

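        /* r8 = byte offset of the first null in the src VEC
           (VEC_SIZE if there is none).  If the length limit does not
           exceed it, the copy is bounded by the limit alone.  */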
        tzcnt %ecx, %r8d
        cmpq %r8, %rdx
        jbe L(less_1x_vec)

        testl %ecx, %ecx
        jz L(more_1x_vec)

        /* The null terminator falls before the length limit: clamp
           the copy length to the string length.  Hoisted above
           L(less_1x_vec) to save code size.  */
        movl %r8d, %edx

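        /* Short copy: at most one VEC of data is copied and a null
           terminator is stored at dst + rdx.  */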
L(less_1x_vec):
        COND_VZEROUPPER

        cmpl $16, %edx
        jae L(copy_16_31)
        cmpl $8, %edx
        jae L(copy_8_15)


# ifdef USE_AS_WCSCPY
        vmovd %VMM_128(0), (%rdi)
        MOVCHAR $0, (%rdi, %rdx)
        ret
# else
        cmpl $4, %edx
        jae L(copy_4_7)

        movzbl (%rsi), %ecx
        cmpl $1, %edx
        jbe L(set_null_term)

        /* NB: make this `vmovw` if support for AVX512-FP16 is added.
         */
        movzwl 1(%rsi), %esi
        movw %si, 1(%rdi)

        .p2align 4,, 1
L(set_null_term):
        movb %cl, (%rdi)
        MOVCHAR $0, (%rdi, %rdx)
        ret

        .p2align 4,, 11
L(copy_4_7):
        movl -(4)(%rsi, %rdx), %ecx
        vmovd %xmm0, (%rdi)
        movl %ecx, -(4)(%rdi, %rdx)
        MOVCHAR $0, (%rdi, %rdx)
        ret
# endif


        .p2align 4,, 10
L(copy_16_31):
        VMOVU -(16)(%rsi, %rdx), %xmm1
        VMOVU %xmm0, (%rdi)
        VMOVU %xmm1, -(16)(%rdi, %rdx)
        MOVCHAR $0, (%rdi, %rdx)
        ret

        .p2align 4,, 10
L(copy_8_15):
        movq -(8)(%rsi, %rdx), %rcx
        vmovq %xmm0, (%rdi)
        movq %rcx, -(8)(%rdi, %rdx)
        MOVCHAR $0, (%rdi, %rdx)
        ret

        .p2align 4,, 8
        .p2align 6,, 14
L(more_1x_vec):
        VMOVU %VMM(0), (%rdi)

        /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
        addq %rsi, %rdx
        subq %rsi, %rdi
        orq $(VEC_SIZE - 1), %rsi
        incq %rsi
        addq %rsi, %rdi
L(loop_last_4x_vec):
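        /* rdx holds the end of the permitted source region here;
           convert it back to the number of bytes remaining from the
           (aligned) rsi.  */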
        subq %rsi, %rdx
        VMOVA 0(%rsi), %VMM(1)
        VPCMPEQ %VMM(1), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
L(last_2x_vec):
        tzcnt %ecx, %ecx
        cmpl %ecx, %edx
        jbe L(ret_vec_x1_len)

        cmpl $VEC_SIZE, %ecx
        jnz L(ret_vec_x1)

        VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
        VMOVU %VMM(1), (%rdi)
        VPCMPEQ %VMM(2), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        addl $-VEC_SIZE, %edx
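        /* Clear the mask bits at or above the remaining length.  If
           no null falls within the limit the copy is bounded by the
           length, otherwise by the null.  */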
        bzhil %edx, %ecx, %r8d
        jz L(ret_vec_x2_len)
L(ret_vec_x2):
        bsfl %ecx, %edx
L(ret_vec_x2_len):
        VMOVU (%rsi, %rdx), %VMM(0)
        MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx)
        VMOVU %VMM(0), (%rdi, %rdx)
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN


        .p2align 4,, 12
L(ret_vec_x1_len):
        movl %edx, %ecx
L(ret_vec_x1):
        VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
        MOVCHAR $0, (%rdi, %rcx)
        VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx)
        VZEROUPPER_RETURN

        .p2align 4,, 8
L(last_4x_vec):
        subq $-(VEC_SIZE * 4), %rsi
        VMOVA 0(%rsi), %VMM(1)
        VPCMPEQ %VMM(1), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        subq $-(VEC_SIZE * 4), %rdi
        addl $-(VEC_SIZE * 4), %edx
        cmpl $(VEC_SIZE * 2), %edx
        jbe L(last_2x_vec)
        .p2align 4,, 8
L(more_2x_vec):
        /* L(ret_vec_x1) expects ecx to hold the position of the
           first match, so test with bsf.  */
        bsfl %ecx, %ecx
        jnz L(ret_vec_x1)

        VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
        VMOVU %VMM(1), (%rdi)

        VPCMPEQ %VMM(2), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        testl %ecx, %ecx
        jnz L(ret_vec_x2)


        VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
        VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi)

        VPCMPEQ %VMM(3), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx

        /* Check if length is greater than 4x VEC.  */
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(more_4x_vec)

        addl $(VEC_SIZE * -2), %edx

        tzcnt %ecx, %ecx
        cmpl %ecx, %edx
        jbe L(ret_vec_x3_len)

        cmpl $VEC_SIZE, %ecx
        jnz L(ret_vec_x3)

        VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
        VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
        VPCMPEQ %VMM(4), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        addl $-VEC_SIZE, %edx
        bzhil %edx, %ecx, %r8d
        jz L(ret_vec_x4_len)
L(ret_vec_x4):
        bsfl %ecx, %edx
L(ret_vec_x4_len):
        VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
        MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx)
        VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
        VZEROUPPER_RETURN

        .p2align 4,, 4
L(ret_vec_x3_len):
        movl %edx, %ecx
L(ret_vec_x3):
        VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0)
        MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx)
        VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx)
        VZEROUPPER_RETURN


        .p2align 4,, 8
L(more_4x_vec):
        bsfl %ecx, %ecx
        jnz L(ret_vec_x3)

        VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
        VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
        VPCMPEQ %VMM(4), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        testl %ecx, %ecx
        jnz L(ret_vec_x4)

        VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)


        /* Recheck length before aligning.  */
        cmpq $(VEC_SIZE * 8), %rdx
        jbe L(last_4x_vec)

        /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
        addq %rsi, %rdx
        subq %rsi, %rdi
        subq $-(VEC_SIZE * 4), %rsi
        andq $(VEC_SIZE * -4), %rsi

        /* Do first half of loop ahead of time so loop can just start
           by storing.  */
        VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
        VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
        VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
        VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

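        /* The unsigned minimum of the four VECs has a zero character
           exactly where at least one of them does, so a single
           compare and movemask covers all four.  */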
        VPMIN %VMM(0), %VMM(1), %VMM(4)
        VPMIN %VMM(2), %VMM(3), %VMM(6)
        VPMIN %VMM(4), %VMM(6), %VMM(6)
        VPCMPEQ %VMM(6), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %r8d
        addq %rsi, %rdi
        testl %r8d, %r8d
        jnz L(loop_4x_done)

        /* Use r9 for end of region before handling last 4x VEC
           specially.  */
        leaq -(VEC_SIZE * 4)(%rdx), %r9

        .p2align 4,, 11
L(loop_4x_vec):

        VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
        VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
        subq $(VEC_SIZE * -4), %rsi
        VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
        VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

        subq $(VEC_SIZE * -4), %rdi
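        /* Stop once rsi has reached r9 (4x VEC before the end of the
           permitted region); the tail is handled at
           L(loop_last_4x_vec).  */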
        cmpq %rsi, %r9
        jbe L(loop_last_4x_vec)

        VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
        VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
        VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
        VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

        VPMIN %VMM(0), %VMM(1), %VMM(4)
        VPMIN %VMM(2), %VMM(3), %VMM(6)
        VPMIN %VMM(4), %VMM(6), %VMM(6)
        VPCMPEQ %VMM(6), %VZERO, %VMM(6)

        vpmovmskb %VMM(6), %r8d

        testl %r8d, %r8d
        jz L(loop_4x_vec)

L(loop_4x_done):
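        /* At least one of the four VECs just read contains a null;
           find which one and finish the copy through the null.  */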
        VPCMPEQ %VMM(0), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        /* L(ret_vec_x1) expects ecx to hold the position of the
           first match, so test with bsf.  */
        bsfl %ecx, %ecx
        jnz L(ret_vec_x1)
        VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

        VPCMPEQ %VMM(1), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx

        testl %ecx, %ecx
        jnz L(ret_vec_x2)
        VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

        VPCMPEQ %VMM(2), %VZERO, %VMM(6)
        vpmovmskb %VMM(6), %ecx
        bsfl %ecx, %ecx
        jnz L(ret_vec_x3)

        VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
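        /* The null is in VMM(3).  bsf gives its byte offset in r8;
           copy one final unaligned VEC whose last character is the
           null terminator.  */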
        bsfl %r8d, %r8d
        VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
        VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
        VZEROUPPER_RETURN



        .p2align 4,, 4
L(page_cross):
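        /* src is within VEC_SIZE of a page boundary.  Read the
           VEC-aligned block containing it (which cannot cross the
           page) and shift the null mask right by the misalignment so
           bit 0 corresponds to src[0].  */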
        movq %rsi, %r8
        andq $(VEC_SIZE * -1), %r8

        VPCMPEQ (%r8), %VZERO, %VMM(6)

        vpmovmskb %VMM(6), %ecx
        shrxl %esi, %ecx, %ecx

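        /* r8d = bytes from src up to the next VEC boundary.  If the
           length limit does not exceed that, everything can be
           handled in the small-copy path.  */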
        subl %esi, %r8d
        andl $(VEC_SIZE - 1), %r8d
        cmpq %r8, %rdx
        jbe L(page_cross_small)

        /* Optimizing more aggressively for space as this is very cold
           code. This saves 2x cache lines.  */

        /* Shifting left by CHAR_SIZE adds one character to the later
           bsf result, which gives copy bounds that include the null
           terminator.  NB: this can never zero out a non-zero RCX: to
           be in the page cross case rsi cannot be VEC-aligned, and
           rcx has already been right-shifted by that misalignment, so
           its top CHAR_SIZE bits are clear.  */
        shll $CHAR_SIZE, %ecx
        jz L(page_cross_continue)
        bsfl %ecx, %ecx
        rep movsb
        VZEROUPPER_RETURN

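        /* Copy min(offset of the first null, length limit) bytes
           with `rep movsb`, then store the null terminator.  */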
L(page_cross_small):
        tzcntl %ecx, %ecx
        jz L(page_cross_setz)
        cmpl %edx, %ecx
        cmova %edx, %ecx
        rep movsb
L(page_cross_setz):
        MOVCHAR $0, (%rdi)
        VZEROUPPER_RETURN
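        /* A length of zero simply returns.  A very large length (the
           overflow case filtered at entry) is handed off to the
           non-length {wcs|str}cat variant via OVERFLOW_STRCAT.  */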
L(zero_len):
# ifdef USE_AS_WCSCPY
        test %rdx, %rdx
# endif
        jnz OVERFLOW_STRCAT
        ret


END(STRNCAT)
#endif