/* Source: glibc/sysdeps/x86_64/multiarch/strcat-avx2.S  */

/* strcat with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register.  */
# define VEC_SIZE	32

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* STRCAT: find the terminating NUL byte of the string at %rdi, then
   continue in the copy code pulled in from strcpy-avx2.S at the end
   of this file.

   Registers as used by the code below:
     %rdi  on entry: start of the string to scan; at
	   L(StartStrcpyPart) it is rewritten to the address of that
	   string's NUL byte.
     %rsi  copied to %rcx at L(StartStrcpyPart) for the included code.
     %rdx  only read on entry when USE_AS_STRNCAT is defined (saved in
	   %r8; presumably the strncat length limit).  Afterwards %edx
	   holds vpmovmskb byte masks.
     %r9   copy of the entry value of %rdi; moved to %rax at
	   L(StartStrcpyPart).
     %rax  scan pointer while searching; every exit path converts it
	   to the offset of the NUL byte from the entry value of %rdi.
     %ymm6 all zero; each vector is compared against it with vpcmpeqb
	   so a set mask bit marks a NUL byte.

   Exit-label convention: L(exit_null_on_<k>th_vector) expects %rax to
   point (k - 1) * VEC_SIZE bytes before the vector whose mask is in
   %edx.  L(exit) expects %rax to point at that vector itself.  */

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

	/* %rax = 0 so that L(exit_null_on_first_vector) can add the bit
	   index straight onto it.  */
	xor	%eax, %eax
	/* %ecx = offset of %rdi inside its (4 * VEC_SIZE)-aligned block.  */
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxor	%xmm6, %xmm6, %xmm6
	/* If the offset is above 3 * VEC_SIZE, a VEC_SIZE-byte load at
	   %rdi would run past the end of that block; use the aligned
	   load path instead.  */
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	/* Unaligned check of the first VEC_SIZE bytes at %rdi.  */
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	/* Round %rax down to a VEC_SIZE boundary; the vector at
	   VEC_SIZE(%rax) is the first one not fully covered above.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* Aligned load of the vector containing %rdi.  Mask bits for the
	   bytes that precede %rdi are cleared with a mask built by
	   shifting -1 left by %cl (only the low bits of %rcx - %rax
	   matter for the shift count).  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb	(%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

L(align_vec_size_start):
	/* Unrolled scan, one aligned vector at a time.  Each group checks
	   the vectors at offsets 1..4 * VEC_SIZE from %rax; the check of
	   the vector at offset 5 * VEC_SIZE advances %rax by
	   4 * VEC_SIZE first, so it becomes the "second vector" of the
	   next group.  */
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Second group.  */
	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Third group.  */
	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Fourth group.  */
	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Advance one vector at a time until %rax is a multiple of
	   4 * VEC_SIZE, which the aligned loads in the main loop rely
	   on.  From here on %rax points at the vector just checked, so
	   a hit goes to L(exit).  */
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

	.p2align 4
L(align_four_vec_loop):
	/* Main loop: 4 * VEC_SIZE bytes per iteration.  The unsigned
	   byte-wise minimum of the four vectors has a zero byte exactly
	   when at least one of them does, so one compare and one mask
	   test cover the whole block.  %rax is advanced before the
	   test, so on loop exit it points just past the block.  */
	vmovaps	(%rax),	%ymm4
	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
	add	$(VEC_SIZE * 4),	%rax
	vpminub	%ymm4,	%ymm5, %ymm5
	vpcmpeqb %ymm5,	%ymm6, %ymm5
	vpmovmskb %ymm5,	%edx
	test	%edx,	%edx
	jz	L(align_four_vec_loop)

	/* A NUL is somewhere in the block just scanned.  Re-check its
	   vectors individually.  After the subtraction %rax points
	   VEC_SIZE bytes before the block, so the block's vectors sit at
	   offsets 1..4 * VEC_SIZE and the shared exit labels apply.  */
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5),	%rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	/* Not in the first three vectors, so the mask of the last one is
	   used without testing it.  */
	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

	/* Exit paths.  Each one leaves in %rax the offset of the NUL byte
	   from the entry value of %rdi: (vector base - %rdi) + index of
	   the lowest set mask bit (+ the vector's offset from %rax).  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	/* Falls through into L(StartStrcpyPart).  */

	.p2align 4
L(StartStrcpyPart):
	/* %rdi = address of the NUL byte found above; %rcx = %rsi.  */
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax      /* save result */

# ifdef USE_AS_STRNCAT
	/* L(ExitZero) is not defined in this file; it is expected to
	   come from the included strcpy-avx2.S.  */
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-avx2.S"
#endif

/* Browse the source code of glibc/sysdeps/x86_64/multiarch/strcat-avx2.S  */