strcat-avx2.S source code [glibc/sysdeps/x86_64/multiarch/strcat-avx2.S]

1	/* strcat with AVX2
2	Copyright (C) 2011-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>.  */
18
/* Everything down to the final #endif is assembled only when
   IS_IN (libc) evaluates true.  */
19	#if IS_IN (libc)
20
21	# include <sysdep.h>
22
/* Default name of the entry point.  A file that defines STRCAT before
   including this one gets its own name instead.  */
23	# ifndef STRCAT
24	# define STRCAT __strcat_avx2
25	# endif
26
/* NOTE(review): nothing in this file tests USE_AS_STRCAT; presumably it
   is consumed by strcpy-avx2.S, which is included at the end of this
   file -- confirm there.  */
27	# define USE_AS_STRCAT
28
29	/* Number of bytes in a vector register.  */
/* VEC_SIZE is the width of the ymm loads below; it is used both as the
   load stride and as the alignment granule for %rax.  */
30	# define VEC_SIZE 32
31
/* SECTION(p) pastes ".avx" onto the section name p, unless a definition
   of SECTION was supplied before this point.  */
32	# ifndef SECTION
33	# define SECTION(p) p##.avx
34	# endif
35
/* Place the code in SECTION(.text) with flags "ax" and type @progbits.  */
36	.section SECTION(.text),"ax",@progbits
/* STRCAT entry point.  %rdi holds the destination string (per the usual
   strcat (dst, src) contract under the System V AMD64 ABI).
   This part starts the search for the null byte that terminates dst.
   Every exit path leaves the offset of that null byte, relative to dst,
   in %rax before reaching L(StartStrcpyPart).

   Register roles established here:
     %r9    copy of the original dst pointer
     %r8    copy of %rdx (only when USE_AS_STRNCAT is defined)
     %ymm6  all-zero vector used as the comparand of every vpcmpeqb
     %rax   0 on the first-vector path; otherwise dst rounded down to a
            multiple of VEC_SIZE
     %edx   byte mask from vpmovmskb: bit i set means byte i was zero.  */
37	ENTRY (STRCAT)
38	mov %rdi, %r9
39	# ifdef USE_AS_STRNCAT
40	mov %rdx, %r8
41	# endif
42
/* %rax = 0 so that L(exit_null_on_first_vector) can add the bit index
   directly.  */
43	xor %eax, %eax
/* %ecx = offset of dst inside its (VEC_SIZE * 4)-byte aligned block.  */
44	mov %edi, %ecx
45	and $((VEC_SIZE * `4`) - `1`), %ecx
/* Zero %xmm6; a VEX-encoded xmm write also clears the upper half of
   %ymm6, which is what the compares below read.  */
46	vpxor %xmm6, %xmm6, %xmm6
/* An offset above VEC_SIZE * 3 means a VEC_SIZE-byte load starting at
   dst would run past the end of that aligned block, so use the
   aligned-load path instead.  */
47	cmp $(VEC_SIZE * `3`), %ecx
48	ja L(fourth_vector_boundary)
/* Compare the first VEC_SIZE bytes at dst (unaligned) against zero.  */
49	vpcmpeqb (%rdi), %ymm6, %ymm0
50	vpmovmskb %ymm0, %edx
51	test %edx, %edx
52	jnz L(exit_null_on_first_vector)
/* No null in those bytes: round dst down to a VEC_SIZE boundary and
   continue with aligned loads from the following vector.  */
53	mov %rdi, %rax
54	and $-VEC_SIZE, %rax
55	jmp L(align_vec_size_start)
/* Aligned-load path: compare the whole aligned vector that contains dst
   and discard the mask bits for bytes that precede dst.  */
56	L(fourth_vector_boundary):
57	mov %rdi, %rax
58	and $-VEC_SIZE, %rax
59	vpcmpeqb (%rax), %ymm6, %ymm0
60	mov $-`1`, %r10d
/* A 32-bit shl uses only the low 5 bits of %cl.  After the sub those
   bits equal dst modulo VEC_SIZE, because %rax is a multiple of
   VEC_SIZE.  %r10d therefore keeps only the bits for bytes at or after
   dst.  */
61	sub %rax, %rcx
62	shl %cl, %r10d
63	vpmovmskb %ymm0, %edx
64	and %r10d, %edx
/* L(exit) expects %rax = address of the vector that produced %edx.  */
65	jnz L(exit)
66
/* Unrolled aligned scan.  On entry %rax is a multiple of VEC_SIZE and
   every byte of dst below VEC_SIZE(%rax) has already been checked.
   Sixteen further aligned vectors are examined one at a time, in four
   groups of four.

   The L(exit_null_on_<N>_vector) targets expect the vector that holds
   the null byte to sit at offset VEC_SIZE * (N - 1) from %rax, with its
   byte mask in %edx.  */
67	L(align_vec_size_start):
/* Group 1: vectors at offsets VEC_SIZE * 1 .. VEC_SIZE * 4.  */
68	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
69	vpmovmskb %ymm0, %edx
70	test %edx, %edx
71	jnz L(exit_null_on_second_vector)
72
73	vpcmpeqb (VEC_SIZE * `2`)(%rax), %ymm6, %ymm1
74	vpmovmskb %ymm1, %edx
75	test %edx, %edx
76	jnz L(exit_null_on_third_vector)
77
78	vpcmpeqb (VEC_SIZE * `3`)(%rax), %ymm6, %ymm2
79	vpmovmskb %ymm2, %edx
80	test %edx, %edx
81	jnz L(exit_null_on_fourth_vector)
82
83	vpcmpeqb (VEC_SIZE * `4`)(%rax), %ymm6, %ymm3
84	vpmovmskb %ymm3, %edx
85	test %edx, %edx
86	jnz L(exit_null_on_fifth_vector)
87
/* Group 2.  The vector at VEC_SIZE * 5 is loaded first; %rax then
   advances by VEC_SIZE * 4, so relative to the new %rax it is the
   "second" vector.  The add precedes the test, so the flags seen by jnz
   come from the test.  */
88	vpcmpeqb (VEC_SIZE * `5`)(%rax), %ymm6, %ymm0
89	add $(VEC_SIZE * `4`), %rax
90	vpmovmskb %ymm0, %edx
91	test %edx, %edx
92	jnz L(exit_null_on_second_vector)
93
94	vpcmpeqb (VEC_SIZE * `2`)(%rax), %ymm6, %ymm1
95	vpmovmskb %ymm1, %edx
96	test %edx, %edx
97	jnz L(exit_null_on_third_vector)
98
99	vpcmpeqb (VEC_SIZE * `3`)(%rax), %ymm6, %ymm2
100	vpmovmskb %ymm2, %edx
101	test %edx, %edx
102	jnz L(exit_null_on_fourth_vector)
103
104	vpcmpeqb (VEC_SIZE * `4`)(%rax), %ymm6, %ymm3
105	vpmovmskb %ymm3, %edx
106	test %edx, %edx
107	jnz L(exit_null_on_fifth_vector)
108
/* Group 3: same pattern as group 2.  */
109	vpcmpeqb (VEC_SIZE * `5`)(%rax), %ymm6, %ymm0
110	add $(VEC_SIZE * `4`), %rax
111	vpmovmskb %ymm0, %edx
112	test %edx, %edx
113	jnz L(exit_null_on_second_vector)
114
115	vpcmpeqb (VEC_SIZE * `2`)(%rax), %ymm6, %ymm1
116	vpmovmskb %ymm1, %edx
117	test %edx, %edx
118	jnz L(exit_null_on_third_vector)
119
120	vpcmpeqb (VEC_SIZE * `3`)(%rax), %ymm6, %ymm2
121	vpmovmskb %ymm2, %edx
122	test %edx, %edx
123	jnz L(exit_null_on_fourth_vector)
124
125	vpcmpeqb (VEC_SIZE * `4`)(%rax), %ymm6, %ymm3
126	vpmovmskb %ymm3, %edx
127	test %edx, %edx
128	jnz L(exit_null_on_fifth_vector)
129
/* Group 4: same pattern as group 2.  */
130	vpcmpeqb (VEC_SIZE * `5`)(%rax), %ymm6, %ymm0
131	add $(VEC_SIZE * `4`), %rax
132	vpmovmskb %ymm0, %edx
133	test %edx, %edx
134	jnz L(exit_null_on_second_vector)
135
136	vpcmpeqb (VEC_SIZE * `2`)(%rax), %ymm6, %ymm1
137	vpmovmskb %ymm1, %edx
138	test %edx, %edx
139	jnz L(exit_null_on_third_vector)
140
141	vpcmpeqb (VEC_SIZE * `3`)(%rax), %ymm6, %ymm2
142	vpmovmskb %ymm2, %edx
143	test %edx, %edx
144	jnz L(exit_null_on_fourth_vector)
145
146	vpcmpeqb (VEC_SIZE * `4`)(%rax), %ymm6, %ymm3
147	vpmovmskb %ymm3, %edx
148	test %edx, %edx
149	jnz L(exit_null_on_fifth_vector)
150
/* No null found so far.  Before entering the four-vector loop, %rax must
   be a multiple of VEC_SIZE * 4.  If it already is, the loop starts at
   (%rax), re-reading vectors that were checked above.  */
151	test $((VEC_SIZE * `4`) - `1`), %rax
152	jz L(align_four_vec_loop)
153
/* Otherwise step forward one vector at a time.  After this add, %rax
   points at the vector just compared, which is what L(exit) expects.  */
154	vpcmpeqb (VEC_SIZE * `5`)(%rax), %ymm6, %ymm0
155	add $(VEC_SIZE * `5`), %rax
156	vpmovmskb %ymm0, %edx
157	test %edx, %edx
158	jnz L(exit)
159
160	test $((VEC_SIZE * `4`) - `1`), %rax
161	jz L(align_four_vec_loop)
162
163	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
164	add $VEC_SIZE, %rax
165	vpmovmskb %ymm1, %edx
166	test %edx, %edx
167	jnz L(exit)
168
169	test $((VEC_SIZE * `4`) - `1`), %rax
170	jz L(align_four_vec_loop)
171
172	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
173	add $VEC_SIZE, %rax
174	vpmovmskb %ymm2, %edx
175	test %edx, %edx
176	jnz L(exit)
177
178	test $((VEC_SIZE * `4`) - `1`), %rax
179	jz L(align_four_vec_loop)
180
/* NOTE(review): %rax is always a multiple of VEC_SIZE in this block, and
   it has advanced by 5, 6 or 7 vectors at the three preceding alignment
   tests.  Counting residues modulo VEC_SIZE * 4, one of the four tests
   above appears to always branch, so this last step looks unreachable
   -- confirm before relying on it or removing it.  */
181	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
182	add $VEC_SIZE, %rax
183	vpmovmskb %ymm3, %edx
184	test %edx, %edx
185	jnz L(exit)
186
187	add $VEC_SIZE, %rax
188
/* Main loop: examine VEC_SIZE * 4 bytes per iteration using aligned
   loads.  Every jz into this label follows a test showing that the low
   bits of %rax (below VEC_SIZE * 4) are zero, and the loop itself
   advances %rax by VEC_SIZE * 4.  */
189	.p2align `4`
190	L(align_four_vec_loop):
/* Fold the four vectors with a byte-wise unsigned minimum.  A zero byte
   in any of them yields a zero byte in %ymm5, so a single compare
   covers all four.  */
191	vmovaps (%rax), %ymm4
192	vpminub VEC_SIZE(%rax), %ymm4, %ymm4
193	vmovaps (VEC_SIZE * `2`)(%rax), %ymm5
194	vpminub (VEC_SIZE * `3`)(%rax), %ymm5, %ymm5
195	add $(VEC_SIZE * `4`), %rax
196	vpminub %ymm4, %ymm5, %ymm5
197	vpcmpeqb %ymm5, %ymm6, %ymm5
198	vpmovmskb %ymm5, %edx
199	test %edx, %edx
200	jz L(align_four_vec_loop)
201
/* A null byte lies somewhere in the four vectors just read, which now
   sit at -(VEC_SIZE * 4) .. -1 relative to %rax.  Re-check them one by
   one to find which.  After the sub they are at offsets VEC_SIZE * 1 ..
   VEC_SIZE * 4 from %rax, matching the numbering the
   L(exit_null_on_<N>_vector) targets expect.  */
202	vpcmpeqb -(VEC_SIZE * `4`)(%rax), %ymm6, %ymm0
203	sub $(VEC_SIZE * `5`), %rax
204	vpmovmskb %ymm0, %edx
205	test %edx, %edx
206	jnz L(exit_null_on_second_vector)
207
208	vpcmpeqb (VEC_SIZE * `2`)(%rax), %ymm6, %ymm1
209	vpmovmskb %ymm1, %edx
210	test %edx, %edx
211	jnz L(exit_null_on_third_vector)
212
213	vpcmpeqb (VEC_SIZE * `3`)(%rax), %ymm6, %ymm2
214	vpmovmskb %ymm2, %edx
215	test %edx, %edx
216	jnz L(exit_null_on_fourth_vector)
217
/* The last vector needs no test: the folded compare reported a null and
   the first three vectors had none.  Compute the offset inline, as
   (%rax - dst) + index of the first null byte + VEC_SIZE * 4.  */
218	vpcmpeqb (VEC_SIZE * `4`)(%rax), %ymm6, %ymm3
219	vpmovmskb %ymm3, %edx
220	sub %rdi, %rax
221	bsf %rdx, %rdx
222	add %rdx, %rax
223	add $(VEC_SIZE * `4`), %rax
224	jmp L(StartStrcpyPart)
225
/* Exit stubs.  Each one turns the byte mask in %edx and the base in %rax
   into the offset of dst's terminating null byte relative to dst
   (%rdi).  The offset is left in %rax for L(StartStrcpyPart).  bsf
   yields the index of the lowest set mask bit, i.e. the first null byte
   within the vector.  */

/* L(exit): %rax is the address of the vector that produced the mask.  */
226	.p2align `4`
227	L(exit):
228	sub %rdi, %rax
/* L(exit_null_on_first_vector): when branched to from the entry code,
   %rax is 0 and the vector started at dst itself.  L(exit) falls
   through to here with %rax already rebased.  */
229	L(exit_null_on_first_vector):
230	bsf %rdx, %rdx
231	add %rdx, %rax
232	jmp L(StartStrcpyPart)
233
/* The null byte is in the vector at VEC_SIZE(%rax).  */
234	.p2align `4`
235	L(exit_null_on_second_vector):
236	sub %rdi, %rax
237	bsf %rdx, %rdx
238	add %rdx, %rax
239	add $VEC_SIZE, %rax
240	jmp L(StartStrcpyPart)
241
/* The null byte is in the vector at (VEC_SIZE * 2)(%rax).  */
242	.p2align `4`
243	L(exit_null_on_third_vector):
244	sub %rdi, %rax
245	bsf %rdx, %rdx
246	add %rdx, %rax
247	add $(VEC_SIZE * `2`), %rax
248	jmp L(StartStrcpyPart)
249
/* The null byte is in the vector at (VEC_SIZE * 3)(%rax).  */
250	.p2align `4`
251	L(exit_null_on_fourth_vector):
252	sub %rdi, %rax
253	bsf %rdx, %rdx
254	add %rdx, %rax
255	add $(VEC_SIZE * `3`), %rax
256	jmp L(StartStrcpyPart)
257
/* The null byte is in the vector at (VEC_SIZE * 4)(%rax).  There is no
   jmp here: execution falls through to L(StartStrcpyPart), which
   follows directly in the file.  */
258	.p2align `4`
259	L(exit_null_on_fifth_vector):
260	sub %rdi, %rax
261	bsf %rdx, %rdx
262	add %rdx, %rax
263	add $(VEC_SIZE * `4`), %rax
264
/* Hand-off to the copy code.  On entry %rax holds the offset of dst's
   terminating null byte relative to dst, and %r9 holds the original dst
   pointer.  L(exit_null_on_fifth_vector) reaches this label by falling
   through; all other paths jump here.  */
265	.p2align `4`
266	L(StartStrcpyPart):
/* %rdi = address of dst's null terminator.  Presumably the included copy
   code starts writing there -- confirm in strcpy-avx2.S.  */
267	lea (%r9, %rax), %rdi
/* %rcx = source pointer (second argument).  */
268	mov %rsi, %rcx
/* %rax = original dst pointer, saved as the result.  */
269	mov %r9, %rax / save result /
270
/* strncat only: %r8 holds the copy of %rdx made at entry; when it is
   zero, branch to L(ExitZero).  That label is not defined in this file;
   presumably strcpy-avx2.S provides it -- confirm.  USE_AS_STRNCPY is
   defined before the include in this configuration.  */
271	# ifdef USE_AS_STRNCAT
272	test %r8, %r8
273	jz L(ExitZero)
274	# define USE_AS_STRNCPY
275	# endif
276
/* The rest of the function body comes from the included file.  No END
   for STRCAT appears in this file, so it is presumably emitted there --
   confirm.  */
277	# include "strcpy-avx2.S"
/* Closes the #if IS_IN (libc) opened near the top of the file.  */
278	#endif
279

Browse the source code of glibc/sysdeps/x86_64/multiarch/strcat-avx2.S