1/* strcat with AVX2
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Name of the entry point defined by ENTRY below.  A definition of STRCAT
   made before this point takes precedence over the default.  */
23# ifndef STRCAT
24# define STRCAT __strcat_avx2
25# endif
26
/* Defined unconditionally, ahead of the strcpy-avx2.S include at the end
   of this file.  NOTE(review): presumably tells the included code that it
   is being assembled as the tail of strcat; confirm in strcpy-avx2.S.  */
27# define USE_AS_STRCAT
28
29/* Number of bytes in a vector register */
30# define VEC_SIZE 32
31
/* SECTION(p) pastes ".avx" onto its argument unless SECTION was already
   defined; it is used by the .section directive below.  */
32# ifndef SECTION
33# define SECTION(p) p##.avx
34# endif
35
36 .section SECTION(.text),"ax",@progbits
/* STRCAT: find the null byte that terminates the string at %rdi, then
   continue into the code included from strcpy-avx2.S.  Only the search
   lives in this file; the body that follows L(StartStrcpyPart) comes
   from the include.

   AT&T operand order (source first, destination last).

   Register use in this part:
     %rdi   first argument; read as the start of the string to scan and
	    rewritten at L(StartStrcpyPart) to the address of its null byte
     %rsi   second argument; only copied to %rcx at L(StartStrcpyPart)
     %rdx   under USE_AS_STRNCAT saved to %r8 on entry; %edx/%rdx is then
	    reused for the byte masks produced by vpmovmskb
     %r9    copy of the incoming %rdi
     %rax   scan cursor; every exit label turns it into the offset of the
	    null byte from %rdi
     %rcx, %r10  scratch for the alignment and mask computation
     %ymm6  all-zero vector that the string bytes are compared against
     %ymm0-%ymm5  compare and min results  */
37ENTRY (STRCAT)
/* Keep the incoming pointer: %rdi is rewritten at L(StartStrcpyPart) and
   %r9 is what is placed in %rax there.  */
38 mov %rdi, %r9
39# ifdef USE_AS_STRNCAT
/* Park the third argument in %r8, since %edx is overwritten below.  */
40 mov %rdx, %r8
41# endif
42
/* %rax = 0, so L(exit_null_on_first_vector) can use the bit index alone as
   the offset.  %ecx = %rdi modulo (VEC_SIZE * 4).  The VEX-encoded vpxor
   on %xmm6 also clears the upper half of %ymm6.  */
43 xor %eax, %eax
44 mov %edi, %ecx
45 and $((VEC_SIZE * 4) - 1), %ecx
46 vpxor %xmm6, %xmm6, %xmm6
/* When %rdi is at most VEC_SIZE * 3 bytes into its VEC_SIZE * 4 aligned
   block, an unaligned VEC_SIZE-byte load from (%rdi) stays inside that
   block.  Otherwise take the aligned-load path below.
   NOTE(review): presumably the point is to never read across a page
   boundary past the end of the string; confirm.  */
47 cmp $(VEC_SIZE * 3), %ecx
48 ja L(fourth_vector_boundary)
/* Compare VEC_SIZE bytes starting at %rdi with zero; each set bit in %edx
   marks a null byte at that index.  */
49 vpcmpeqb (%rdi), %ymm6, %ymm0
50 vpmovmskb %ymm0, %edx
51 test %edx, %edx
52 jnz L(exit_null_on_first_vector)
/* No null in those bytes.  Round the cursor down to a VEC_SIZE boundary;
   the scan resumes at VEC_SIZE(%rax), which may overlap bytes that were
   just checked.  */
53 mov %rdi, %rax
54 and $-VEC_SIZE, %rax
55 jmp L(align_vec_size_start)
56L(fourth_vector_boundary):
/* Aligned load from the VEC_SIZE boundary at or below %rdi.  Mask bits for
   the bytes in front of %rdi must be dropped: after the sub, the low five
   bits of %cl equal %rdi modulo VEC_SIZE (a 32-bit shl only uses the low
   five bits of %cl), so %r10d = -1 shifted left by that many bits.  The
   and both applies the mask and sets the flags for the branch.  */
57 mov %rdi, %rax
58 and $-VEC_SIZE, %rax
59 vpcmpeqb (%rax), %ymm6, %ymm0
60 mov $-1, %r10d
61 sub %rax, %rcx
62 shl %cl, %r10d
63 vpmovmskb %ymm0, %edx
64 and %r10d, %edx
65 jnz L(exit)
66
67L(align_vec_size_start):
/* Unrolled scan over aligned vectors, four rounds of four vectors each.
   Within a round the vectors sit at VEC_SIZE .. VEC_SIZE * 4 past %rax,
   and the exit label taken encodes which of them produced the mask.
   Round 1.  */
68 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
69 vpmovmskb %ymm0, %edx
70 test %edx, %edx
71 jnz L(exit_null_on_second_vector)
72
73 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
74 vpmovmskb %ymm1, %edx
75 test %edx, %edx
76 jnz L(exit_null_on_third_vector)
77
78 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
79 vpmovmskb %ymm2, %edx
80 test %edx, %edx
81 jnz L(exit_null_on_fourth_vector)
82
83 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
84 vpmovmskb %ymm3, %edx
85 test %edx, %edx
86 jnz L(exit_null_on_fifth_vector)
87
/* Round 2.  The compare reads the vector at VEC_SIZE * 5 and %rax is then
   advanced by VEC_SIZE * 4, so that same vector is at VEC_SIZE(%rax) when
   the branch is taken.  The flags written by add are replaced by test
   before the branch reads them.  */
88 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
89 add $(VEC_SIZE * 4), %rax
90 vpmovmskb %ymm0, %edx
91 test %edx, %edx
92 jnz L(exit_null_on_second_vector)
93
94 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
95 vpmovmskb %ymm1, %edx
96 test %edx, %edx
97 jnz L(exit_null_on_third_vector)
98
99 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
100 vpmovmskb %ymm2, %edx
101 test %edx, %edx
102 jnz L(exit_null_on_fourth_vector)
103
104 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
105 vpmovmskb %ymm3, %edx
106 test %edx, %edx
107 jnz L(exit_null_on_fifth_vector)
108
/* Round 3, same shape as round 2.  */
109 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
110 add $(VEC_SIZE * 4), %rax
111 vpmovmskb %ymm0, %edx
112 test %edx, %edx
113 jnz L(exit_null_on_second_vector)
114
115 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
116 vpmovmskb %ymm1, %edx
117 test %edx, %edx
118 jnz L(exit_null_on_third_vector)
119
120 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
121 vpmovmskb %ymm2, %edx
122 test %edx, %edx
123 jnz L(exit_null_on_fourth_vector)
124
125 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
126 vpmovmskb %ymm3, %edx
127 test %edx, %edx
128 jnz L(exit_null_on_fifth_vector)
129
/* Round 4, same shape as round 2.  */
130 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
131 add $(VEC_SIZE * 4), %rax
132 vpmovmskb %ymm0, %edx
133 test %edx, %edx
134 jnz L(exit_null_on_second_vector)
135
136 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
137 vpmovmskb %ymm1, %edx
138 test %edx, %edx
139 jnz L(exit_null_on_third_vector)
140
141 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
142 vpmovmskb %ymm2, %edx
143 test %edx, %edx
144 jnz L(exit_null_on_fourth_vector)
145
146 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
147 vpmovmskb %ymm3, %edx
148 test %edx, %edx
149 jnz L(exit_null_on_fifth_vector)
150
/* Bring %rax to a multiple of VEC_SIZE * 4 before entering the main loop.
   If it already is one, the loop starts at (%rax) and looks again at
   vectors checked above.  */
151 test $((VEC_SIZE * 4) - 1), %rax
152 jz L(align_four_vec_loop)
153
/* Alignment steps: check one more vector, leave %rax pointing at that
   vector (which is what L(exit) expects), then test the alignment again.
   The first step moves by VEC_SIZE * 5 because everything up to and
   including VEC_SIZE * 4 past %rax has been checked already.  */
154 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
155 add $(VEC_SIZE * 5), %rax
156 vpmovmskb %ymm0, %edx
157 test %edx, %edx
158 jnz L(exit)
159
160 test $((VEC_SIZE * 4) - 1), %rax
161 jz L(align_four_vec_loop)
162
163 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
164 add $VEC_SIZE, %rax
165 vpmovmskb %ymm1, %edx
166 test %edx, %edx
167 jnz L(exit)
168
169 test $((VEC_SIZE * 4) - 1), %rax
170 jz L(align_four_vec_loop)
171
172 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
173 add $VEC_SIZE, %rax
174 vpmovmskb %ymm2, %edx
175 test %edx, %edx
176 jnz L(exit)
177
178 test $((VEC_SIZE * 4) - 1), %rax
179 jz L(align_four_vec_loop)
180
/* NOTE(review): %rax is a multiple of VEC_SIZE on every path that reaches
   the first alignment test, and each step adds a multiple of VEC_SIZE, so
   one of the jz branches above appears to be taken in all cases and this
   last step looks unreachable.  Harmless; confirm before removing.  */
181 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
182 add $VEC_SIZE, %rax
183 vpmovmskb %ymm3, %edx
184 test %edx, %edx
185 jnz L(exit)
186
/* Step past the vector just checked before falling into the loop.  */
187 add $VEC_SIZE, %rax
188
189 .p2align 4
190L(align_four_vec_loop):
/* Main loop, VEC_SIZE * 4 bytes per iteration.  vpminub takes the unsigned
   byte-wise minimum, so a byte of the combined %ymm5 is zero exactly when
   at least one of the four vectors has a zero byte in that position; one
   compare against %ymm6 then covers the whole block.  vmovaps with a
   memory operand needs an aligned address.  %rax is advanced before the
   test, so when the loop ends it points just past the block that holds
   the null byte.  */
191 vmovaps (%rax), %ymm4
192 vpminub VEC_SIZE(%rax), %ymm4, %ymm4
193 vmovaps (VEC_SIZE * 2)(%rax), %ymm5
194 vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
195 add $(VEC_SIZE * 4), %rax
196 vpminub %ymm4, %ymm5, %ymm5
197 vpcmpeqb %ymm5, %ymm6, %ymm5
198 vpmovmskb %ymm5, %edx
199 test %edx, %edx
200 jz L(align_four_vec_loop)
201
/* The combined mask only says that the block contains a null byte, so its
   four vectors are checked again one by one.  %rax is moved back to
   VEC_SIZE before the start of the block; the block then sits at
   VEC_SIZE .. VEC_SIZE * 4 past %rax and the shared exit labels apply.  */
202 vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
203 sub $(VEC_SIZE * 5), %rax
204 vpmovmskb %ymm0, %edx
205 test %edx, %edx
206 jnz L(exit_null_on_second_vector)
207
208 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
209 vpmovmskb %ymm1, %edx
210 test %edx, %edx
211 jnz L(exit_null_on_third_vector)
212
213 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
214 vpmovmskb %ymm2, %edx
215 test %edx, %edx
216 jnz L(exit_null_on_fourth_vector)
217
/* The first three vectors of the block had no null byte, so it is in the
   fourth one and the mask is not tested.  Same arithmetic as
   L(exit_null_on_fifth_vector), done inline.  */
218 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
219 vpmovmskb %ymm3, %edx
220 sub %rdi, %rax
221 bsf %rdx, %rdx
222 add %rdx, %rax
223 add $(VEC_SIZE * 4), %rax
224 jmp L(StartStrcpyPart)
225
/* Exit labels.  On entry %edx holds a non-zero byte mask; vpmovmskb wrote
   the 32-bit register, so the upper half of %rdx is zero and the 64-bit
   bsf yields the index of the lowest set bit.  Each label leaves in %rax
   the offset of the null byte from %rdi.  */
226 .p2align 4
227L(exit):
/* %rax is the address of the vector that produced the mask.  */
228 sub %rdi, %rax
229L(exit_null_on_first_vector):
/* Reached directly with %rax = 0 when the vector was loaded from %rdi
   itself.  */
230 bsf %rdx, %rdx
231 add %rdx, %rax
232 jmp L(StartStrcpyPart)
233
234 .p2align 4
235L(exit_null_on_second_vector):
/* The mask came from the vector at VEC_SIZE(%rax).  */
236 sub %rdi, %rax
237 bsf %rdx, %rdx
238 add %rdx, %rax
239 add $VEC_SIZE, %rax
240 jmp L(StartStrcpyPart)
241
242 .p2align 4
243L(exit_null_on_third_vector):
/* The mask came from the vector at (VEC_SIZE * 2)(%rax).  */
244 sub %rdi, %rax
245 bsf %rdx, %rdx
246 add %rdx, %rax
247 add $(VEC_SIZE * 2), %rax
248 jmp L(StartStrcpyPart)
249
250 .p2align 4
251L(exit_null_on_fourth_vector):
/* The mask came from the vector at (VEC_SIZE * 3)(%rax).  */
252 sub %rdi, %rax
253 bsf %rdx, %rdx
254 add %rdx, %rax
255 add $(VEC_SIZE * 3), %rax
256 jmp L(StartStrcpyPart)
257
258 .p2align 4
259L(exit_null_on_fifth_vector):
/* The mask came from the vector at (VEC_SIZE * 4)(%rax).  There is no jmp
   here: execution falls through the alignment padding into
   L(StartStrcpyPart).  */
260 sub %rdi, %rax
261 bsf %rdx, %rdx
262 add %rdx, %rax
263 add $(VEC_SIZE * 4), %rax
264
265 .p2align 4
266L(StartStrcpyPart):
/* %rax holds the offset of the null byte.  Set %rdi to the address of that
   byte, copy the second argument to %rcx, and put the original first
   argument (kept in %r9) in %rax.  */
267 lea (%r9, %rax), %rdi
268 mov %rsi, %rcx
269 mov %r9, %rax /* save result */
270
271# ifdef USE_AS_STRNCAT
/* %r8 is the third argument saved on entry; when it is zero, branch to
   L(ExitZero).  That label is not defined in this file.
   NOTE(review): presumably provided by strcpy-avx2.S; confirm.  */
272 test %r8, %r8
273 jz L(ExitZero)
274# define USE_AS_STRNCPY
275# endif
276
/* Execution continues into the included code.  No END for STRCAT appears
   in this file.
   NOTE(review): presumably the include supplies it; confirm.  */
277# include "strcpy-avx2.S"
278#endif
279