/* strcat with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register.  */
# define VEC_SIZE	32

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

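	/* %ymm6 stays zero throughout and serves as the NUL-byte
	   comparand.  If the destination is within VEC_SIZE bytes of
	   the next (VEC_SIZE * 4) boundary, an unaligned VEC_SIZE load
	   from it could cross into an unmapped page, so take the
	   aligned path instead.  */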
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxor	%xmm6, %xmm6, %xmm6
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
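
	/* Align the load down to VEC_SIZE.  Bytes before the start of
	   the string may compare equal to NUL, so build a mask
	   (-1 << misalignment) in %r10d and discard those match bits
	   before testing.  */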
L(fourth_vector_boundary):
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

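	/* Unrolled scan: test successive vectors for a NUL byte, one
	   VEC_SIZE compare at a time, advancing %rax in VEC_SIZE * 4
	   strides.  */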
L(align_vec_size_start):
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

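	/* %rax may still be only VEC_SIZE-aligned here; step one vector
	   at a time until it reaches a VEC_SIZE * 4 boundary so the
	   main loop can use aligned loads.  */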
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

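	/* Main loop: vpminub folds four vectors into one.  The minimum
	   contains a zero byte iff at least one of the four source
	   vectors does, so a single vpcmpeqb/vpmovmskb tests
	   VEC_SIZE * 4 (128) bytes per iteration.  */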
	.p2align 4
L(align_four_vec_loop):
	vmovaps	(%rax), %ymm4
	vpminub	VEC_SIZE(%rax), %ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax), %ymm5
	vpminub	(VEC_SIZE * 3)(%rax), %ymm5, %ymm5
	add	$(VEC_SIZE * 4), %rax
	vpminub	%ymm4, %ymm5, %ymm5
	vpcmpeqb %ymm5, %ymm6, %ymm5
	vpmovmskb %ymm5, %edx
	test	%edx, %edx
	jz	L(align_four_vec_loop)

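	/* A NUL byte lies in one of the last four vectors; back %rax up
	   and retest each vector individually to find which one.  */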
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

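	/* Each exit label leaves %rax = offset of the terminating NUL
	   from the original destination: the signed distance of the
	   aligned block from %rdi, plus the bit index of the first
	   match in %rdx, plus the vector's offset within the block.  */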
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

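	/* %rax now holds strlen (destination).  Point %rdi at the
	   terminating NUL and fall through to the strcpy code included
	   below, which appends the source string; the original
	   destination is kept in %rax as the return value.  */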
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-avx2.S"
#endif