/* strcat with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register.  */
# define VEC_SIZE	32

	.section .text.avx,"ax",@progbits
ENTRY (STRCAT)
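	/* Save the destination pointer: it is also the return value.
	   For strncat the maximum number of bytes to append is passed
	   in %rdx and preserved in %r8.  */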
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

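	/* Compute the offset of %rdi within its 4 * VEC_SIZE block.  If
	   the offset is at most 3 * VEC_SIZE, an unaligned VEC_SIZE load
	   from %rdi stays within that block and therefore cannot cross a
	   page boundary, so the first vector can be checked directly.  */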
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxor	%xmm6, %xmm6, %xmm6
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
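	/* The start of the string lies in the last VEC_SIZE bytes of its
	   4 * VEC_SIZE block.  Check the containing aligned vector and
	   mask off the bytes that precede the start of the string.  */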
L(fourth_vector_boundary):
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

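	/* %rax is VEC_SIZE aligned.  Scan the string one vector at a
	   time until either a null byte is found or %rax becomes
	   4 * VEC_SIZE aligned, at which point the main loop below
	   takes over.  */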
L(align_vec_size_start):
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

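	/* Main loop: %rax is now 4 * VEC_SIZE aligned.  Check
	   4 * VEC_SIZE bytes per iteration by folding the four vectors
	   together with vpminub; the result contains a zero byte iff at
	   least one of the four vectors does.  */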
	.p2align 4
L(align_four_vec_loop):
	vmovaps	(%rax), %ymm4
	vpminub	VEC_SIZE(%rax), %ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax), %ymm5
	vpminub	(VEC_SIZE * 3)(%rax), %ymm5, %ymm5
	add	$(VEC_SIZE * 4), %rax
	vpminub	%ymm4, %ymm5, %ymm5
	vpcmpeqb %ymm5, %ymm6, %ymm5
	vpmovmskb %ymm5, %edx
	test	%edx, %edx
	jz	L(align_four_vec_loop)

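	/* A null byte lies somewhere in the 4 * VEC_SIZE block just
	   scanned.  Recheck each of its four vectors to locate it.  */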
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

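	/* The exit paths below compute, in %rax, the offset of the
	   terminating null byte from the original destination pointer.  */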
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

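	/* For strncat a length bound of zero means nothing is appended;
	   the L(ExitZero) path is provided by the included strcpy code.  */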
# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

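/* The actual copy of the source string is performed by the strcpy
   (strncpy for strncat) implementation included below.  */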
# include "strcpy-avx2.S"
#endif