1/* strcat with AVX2
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (3)
22
23
24# include <sysdep.h>
25
26# ifndef STRCAT
27# define STRCAT __strcat_avx2
28# endif
29
30# define USE_AS_STRCAT
31
32/* Number of bytes in a vector register */
33# define VEC_SIZE 32
34
35# ifndef SECTION
36# define SECTION(p) p##.avx
37# endif
38
39 .section SECTION(.text),"ax",@progbits
/* strcat/strncat with AVX2.
   SysV AMD64: rdi = dst, rsi = src; for strncat additionally rdx = maxlen.
   Returns dst in rax.
   Phase 1 (this file): find the offset of DST's terminating NUL using
   32-byte AVX2 compares against an all-zero vector (ymm6).
   Phase 2: fall through into strcpy-avx2.S (included at the bottom,
   which also provides the END marker) to copy SRC over that NUL.  */
40ENTRY (STRCAT)
	/* Preserve DST in r9 so it can be returned unchanged.  */
 41 mov %rdi, %r9
42# ifdef USE_AS_STRNCAT
	/* r8 = maxlen; consumed by the strncpy part of phase 2.  */
 43 mov %rdx, %r8
44# endif
45
	/* rax = 0: running byte offset (relative to DST) of the NUL.  */
 46 xor %eax, %eax
	/* ecx = dst mod (4 * VEC_SIZE): position inside a 128-byte block.  */
 47 mov %edi, %ecx
 48 and $((VEC_SIZE * 4) - 1), %ecx
	/* ymm6 = all zeros: the NUL comparand used by every vpcmpeqb
	   below (VEX 128-bit xor also clears the upper lane).  */
 49 vpxor %xmm6, %xmm6, %xmm6
	/* If dst lies in the last vector of its 128-byte block, an
	   unaligned 32-byte load from dst could cross into the next
	   (possibly unmapped) page; use the aligned-load-plus-mask
	   path instead.  */
 50 cmp $(VEC_SIZE * 3), %ecx
 51 ja L(fourth_vector_boundary)
	/* Safe here: unaligned check of the first VEC_SIZE bytes.  */
 52 vpcmpeqb (%rdi), %ymm6, %ymm0
 53 vpmovmskb %ymm0, %edx
 54 test %edx, %edx
 55 jnz L(exit_null_on_first_vector)
	/* No NUL in the first vector: continue scanning from dst
	   rounded down to a VEC_SIZE boundary.  */
 56 mov %rdi, %rax
 57 and $-VEC_SIZE, %rax
 58 jmp L(align_vec_size_start)
59L(fourth_vector_boundary):
	/* Aligned load of the vector containing dst; build a mask that
	   discards match bits for bytes before dst so stray NULs ahead
	   of the string are ignored.  */
 60 mov %rdi, %rax
 61 and $-VEC_SIZE, %rax
 62 vpcmpeqb (%rax), %ymm6, %ymm0
 63 mov $-1, %r10d
 64 sub %rax, %rcx
	/* shl on a 32-bit operand masks the count to 5 bits, so this is
	   effectively -1 << (dst & (VEC_SIZE - 1)): ones exactly at byte
	   positions >= dst within the vector.  */
 65 shl %cl, %r10d
 66 vpmovmskb %ymm0, %edx
 67 and %r10d, %edx
 68 jnz L(exit)
69
70L(align_vec_size_start):
	/* Unrolled scan: test the four vectors at rax + 1..4 * VEC_SIZE,
	   then advance rax by 4 * VEC_SIZE.  The round is repeated four
	   times before checking whether rax has reached 4 * VEC_SIZE
	   alignment for the main loop.  The exit_null_on_* labels encode
	   which of the four vectors held the NUL.  */
 71 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
 72 vpmovmskb %ymm0, %edx
 73 test %edx, %edx
 74 jnz L(exit_null_on_second_vector)
75
 76 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 77 vpmovmskb %ymm1, %edx
 78 test %edx, %edx
 79 jnz L(exit_null_on_third_vector)
80
 81 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 82 vpmovmskb %ymm2, %edx
 83 test %edx, %edx
 84 jnz L(exit_null_on_fourth_vector)
85
 86 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 87 vpmovmskb %ymm3, %edx
 88 test %edx, %edx
 89 jnz L(exit_null_on_fifth_vector)
90
	/* Second unrolled round.  The compare at 5 * VEC_SIZE is issued
	   before rax is bumped by 4 * VEC_SIZE, so after the add it is
	   the "second vector" (rax + VEC_SIZE) again and the same exit
	   labels remain valid.  */
 91 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 92 add $(VEC_SIZE * 4), %rax
 93 vpmovmskb %ymm0, %edx
 94 test %edx, %edx
 95 jnz L(exit_null_on_second_vector)
96
 97 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 98 vpmovmskb %ymm1, %edx
 99 test %edx, %edx
 100 jnz L(exit_null_on_third_vector)
101
 102 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 103 vpmovmskb %ymm2, %edx
 104 test %edx, %edx
 105 jnz L(exit_null_on_fourth_vector)
106
 107 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 108 vpmovmskb %ymm3, %edx
 109 test %edx, %edx
 110 jnz L(exit_null_on_fifth_vector)
111
	/* Third unrolled round.  */
 112 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 113 add $(VEC_SIZE * 4), %rax
 114 vpmovmskb %ymm0, %edx
 115 test %edx, %edx
 116 jnz L(exit_null_on_second_vector)
117
 118 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 119 vpmovmskb %ymm1, %edx
 120 test %edx, %edx
 121 jnz L(exit_null_on_third_vector)
122
 123 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 124 vpmovmskb %ymm2, %edx
 125 test %edx, %edx
 126 jnz L(exit_null_on_fourth_vector)
127
 128 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 129 vpmovmskb %ymm3, %edx
 130 test %edx, %edx
 131 jnz L(exit_null_on_fifth_vector)
132
	/* Fourth unrolled round.  */
 133 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 134 add $(VEC_SIZE * 4), %rax
 135 vpmovmskb %ymm0, %edx
 136 test %edx, %edx
 137 jnz L(exit_null_on_second_vector)
138
 139 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 140 vpmovmskb %ymm1, %edx
 141 test %edx, %edx
 142 jnz L(exit_null_on_third_vector)
143
 144 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 145 vpmovmskb %ymm2, %edx
 146 test %edx, %edx
 147 jnz L(exit_null_on_fourth_vector)
148
 149 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 150 vpmovmskb %ymm3, %edx
 151 test %edx, %edx
 152 jnz L(exit_null_on_fifth_vector)
153
	/* Step one vector at a time (testing each before advancing)
	   until rax is 4 * VEC_SIZE aligned, then enter the 4-vector
	   main loop, which uses aligned loads.  */
 154 test $((VEC_SIZE * 4) - 1), %rax
 155 jz L(align_four_vec_loop)
156
 157 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 158 add $(VEC_SIZE * 5), %rax
 159 vpmovmskb %ymm0, %edx
 160 test %edx, %edx
 161 jnz L(exit)
162
 163 test $((VEC_SIZE * 4) - 1), %rax
 164 jz L(align_four_vec_loop)
165
 166 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
 167 add $VEC_SIZE, %rax
 168 vpmovmskb %ymm1, %edx
 169 test %edx, %edx
 170 jnz L(exit)
171
 172 test $((VEC_SIZE * 4) - 1), %rax
 173 jz L(align_four_vec_loop)
174
 175 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
 176 add $VEC_SIZE, %rax
 177 vpmovmskb %ymm2, %edx
 178 test %edx, %edx
 179 jnz L(exit)
180
 181 test $((VEC_SIZE * 4) - 1), %rax
 182 jz L(align_four_vec_loop)
183
 184 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
 185 add $VEC_SIZE, %rax
 186 vpmovmskb %ymm3, %edx
 187 test %edx, %edx
 188 jnz L(exit)
189
	/* Skip past the vector just tested; rax is now 4 * VEC_SIZE
	   aligned at the start of unscanned data.  */
 190 add $VEC_SIZE, %rax
191
 192 .p2align 4
193L(align_four_vec_loop):
	/* Main loop: 4 * VEC_SIZE bytes per iteration with aligned
	   loads.  vpminub folds the four vectors into one; the unsigned
	   byte-minimum is zero iff at least one input byte was NUL, so
	   a single compare+movmsk tests all 128 bytes.  */
 194 vmovaps (%rax), %ymm4
 195 vpminub VEC_SIZE(%rax), %ymm4, %ymm4
 196 vmovaps (VEC_SIZE * 2)(%rax), %ymm5
 197 vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
 198 add $(VEC_SIZE * 4), %rax
 199 vpminub %ymm4, %ymm5, %ymm5
 200 vpcmpeqb %ymm5, %ymm6, %ymm5
 201 vpmovmskb %ymm5, %edx
 202 test %edx, %edx
 203 jz L(align_four_vec_loop)
204
	/* A NUL is somewhere in the last 4 vectors.  Re-test the first
	   of them, then rebase rax by -5 * VEC_SIZE so the four vectors
	   sit at offsets 1..4 * VEC_SIZE and the shared exit_null_on_*
	   labels below compute the right offset.  */
 205 vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
 206 sub $(VEC_SIZE * 5), %rax
 207 vpmovmskb %ymm0, %edx
 208 test %edx, %edx
 209 jnz L(exit_null_on_second_vector)
210
 211 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 212 vpmovmskb %ymm1, %edx
 213 test %edx, %edx
 214 jnz L(exit_null_on_third_vector)
215
 216 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 217 vpmovmskb %ymm2, %edx
 218 test %edx, %edx
 219 jnz L(exit_null_on_fourth_vector)
220
	/* Not in the first three, so the NUL must be in the fourth
	   vector; compute its offset from dst directly (no test/jnz
	   needed).  */
 221 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 222 vpmovmskb %ymm3, %edx
 223 sub %rdi, %rax
 224 bsf %rdx, %rdx
 225 add %rdx, %rax
 226 add $(VEC_SIZE * 4), %rax
 227 jmp L(StartStrcpyPart)
228
 229 .p2align 4
	/* All exit paths compute rax = strlen (dst):
	   (tested-vector base - dst) + index of first NUL bit in edx
	   (+ the vector's offset from the base).  */
230L(exit):
	/* rax holds the address of the vector whose mask is in edx.  */
 231 sub %rdi, %rax
232L(exit_null_on_first_vector):
	/* rax already holds the base offset (0 on the unaligned first-
	   vector path, where eax was zeroed at entry).  */
 233 bsf %rdx, %rdx
 234 add %rdx, %rax
 235 jmp L(StartStrcpyPart)
236
 237 .p2align 4
238L(exit_null_on_second_vector):
	/* NUL was found at rax + VEC_SIZE.  */
 239 sub %rdi, %rax
 240 bsf %rdx, %rdx
 241 add %rdx, %rax
 242 add $VEC_SIZE, %rax
 243 jmp L(StartStrcpyPart)
244
 245 .p2align 4
246L(exit_null_on_third_vector):
	/* NUL was found at rax + 2 * VEC_SIZE.  */
 247 sub %rdi, %rax
 248 bsf %rdx, %rdx
 249 add %rdx, %rax
 250 add $(VEC_SIZE * 2), %rax
 251 jmp L(StartStrcpyPart)
252
 253 .p2align 4
254L(exit_null_on_fourth_vector):
	/* NUL was found at rax + 3 * VEC_SIZE.  */
 255 sub %rdi, %rax
 256 bsf %rdx, %rdx
 257 add %rdx, %rax
 258 add $(VEC_SIZE * 3), %rax
 259 jmp L(StartStrcpyPart)
260
 261 .p2align 4
262L(exit_null_on_fifth_vector):
	/* NUL was found at rax + 4 * VEC_SIZE; fall through.  */
 263 sub %rdi, %rax
 264 bsf %rdx, %rdx
 265 add %rdx, %rax
 266 add $(VEC_SIZE * 4), %rax
267
 268 .p2align 4
269L(StartStrcpyPart):
	/* Hand off to the strcpy part (strcpy-avx2.S with USE_AS_STRCAT):
	   rdi = &dst[strlen (dst)] (copy destination),
	   rcx = src (the strcpy code reads the source from rcx here),
	   rax = original dst (the value strcat must return).  */
 270 lea (%r9, %rax), %rdi
 271 mov %rsi, %rcx
 272 mov %r9, %rax /* save result */
273
274# ifdef USE_AS_STRNCAT
	/* strncat with maxlen == 0 copies nothing; L(ExitZero) is
	   defined in strcpy-avx2.S.  */
 275 test %r8, %r8
 276 jz L(ExitZero)
277# define USE_AS_STRNCPY
278# endif
279
	/* The included file implements the copy and emits END (STRCAT).  */
280# include "strcpy-avx2.S"
281#endif
282