1/* strcat with AVX2
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (3)
22
23
24# include <sysdep.h>
25
26# ifndef STRCAT
27# define STRCAT __strcat_avx2
28# endif
29
30# define USE_AS_STRCAT
31
32/* Number of bytes in a vector register */
33# define VEC_SIZE 32
34
35# ifndef SECTION
36# define SECTION(p) p##.avx
37# endif
38
39 .section SECTION(.text),"ax",@progbits
/* strcat/strncat with AVX2.
   SysV AMD64: rdi = dst, rsi = src; for strncat additionally rdx = maxlen.
   Returns dst in rax.
   Phase 1 (this file): find the offset of DST's terminating NUL using
   32-byte AVX2 compares against an all-zero vector (ymm6).
   Phase 2: fall through into strcpy-avx2.S (included at the bottom,
   which also provides the END marker) to copy SRC over that NUL.  */
40ENTRY (STRCAT)
	/* Preserve DST in r9 so it can be returned unchanged.  */
 41 mov %rdi, %r9
42# ifdef USE_AS_STRNCAT
	/* r8 = maxlen; consumed by the strncpy part of phase 2.  */
 43 mov %rdx, %r8
44# endif
45
	/* rax = 0: running byte offset (relative to DST) of the NUL.  */
 46 xor %eax, %eax
	/* ecx = dst mod (4 * VEC_SIZE): position inside a 128-byte block.  */
 47 mov %edi, %ecx
 48 and $((VEC_SIZE * 4) - 1), %ecx
	/* ymm6 = all zeros: the NUL comparand used by every vpcmpeqb
	   below (VEX 128-bit xor also clears the upper lane).  */
 49 vpxor %xmm6, %xmm6, %xmm6
	/* If dst lies in the last vector of its 128-byte block, an
	   unaligned 32-byte load from dst could cross into the next
	   (possibly unmapped) page; use the aligned-load-plus-mask
	   path instead.  */
 50 cmp $(VEC_SIZE * 3), %ecx
 51 ja L(fourth_vector_boundary)
	/* Safe here: unaligned check of the first VEC_SIZE bytes.  */
 52 vpcmpeqb (%rdi), %ymm6, %ymm0
 53 vpmovmskb %ymm0, %edx
 54 test %edx, %edx
 55 jnz L(exit_null_on_first_vector)
	/* No NUL in the first vector: continue scanning from dst
	   rounded down to a VEC_SIZE boundary.  */
 56 mov %rdi, %rax
 57 and $-VEC_SIZE, %rax
 58 jmp L(align_vec_size_start)
59L(fourth_vector_boundary):
	/* Aligned load of the vector containing dst; build a mask that
	   discards match bits for bytes before dst so stray NULs ahead
	   of the string are ignored.  */
 60 mov %rdi, %rax
 61 and $-VEC_SIZE, %rax
 62 vpcmpeqb (%rax), %ymm6, %ymm0
 63 mov $-1, %r10d
 64 sub %rax, %rcx
	/* shl on a 32-bit operand masks the count to 5 bits, so this is
	   effectively -1 << (dst & (VEC_SIZE - 1)): ones exactly at byte
	   positions >= dst within the vector.  */
 65 shl %cl, %r10d
 66 vpmovmskb %ymm0, %edx
 67 and %r10d, %edx
 68 jnz L(exit)
69
70L(align_vec_size_start):
	/* Unrolled scan: test the four vectors at rax + 1..4 * VEC_SIZE,
	   then advance rax by 4 * VEC_SIZE.  The round is repeated four
	   times before checking whether rax has reached 4 * VEC_SIZE
	   alignment for the main loop.  The exit_null_on_* labels encode
	   which of the four vectors held the NUL.  */
 71 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
 72 vpmovmskb %ymm0, %edx
 73 test %edx, %edx
 74 jnz L(exit_null_on_second_vector)
75
 76 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 77 vpmovmskb %ymm1, %edx
 78 test %edx, %edx
 79 jnz L(exit_null_on_third_vector)
80
 81 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 82 vpmovmskb %ymm2, %edx
 83 test %edx, %edx
 84 jnz L(exit_null_on_fourth_vector)
85
 86 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 87 vpmovmskb %ymm3, %edx
 88 test %edx, %edx
 89 jnz L(exit_null_on_fifth_vector)
90
	/* Second unrolled round.  The compare at 5 * VEC_SIZE is issued
	   before rax is bumped by 4 * VEC_SIZE, so after the add it is
	   the "second vector" (rax + VEC_SIZE) again and the same exit
	   labels remain valid.  */
 91 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 92 add $(VEC_SIZE * 4), %rax
 93 vpmovmskb %ymm0, %edx
 94 test %edx, %edx
 95 jnz L(exit_null_on_second_vector)
96
 97 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 98 vpmovmskb %ymm1, %edx
 99 test %edx, %edx
 100 jnz L(exit_null_on_third_vector)
101
 102 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 103 vpmovmskb %ymm2, %edx
 104 test %edx, %edx
 105 jnz L(exit_null_on_fourth_vector)
106
 107 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 108 vpmovmskb %ymm3, %edx
 109 test %edx, %edx
 110 jnz L(exit_null_on_fifth_vector)
111
	/* Third unrolled round.  */
 112 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 113 add $(VEC_SIZE * 4), %rax
 114 vpmovmskb %ymm0, %edx
 115 test %edx, %edx
 116 jnz L(exit_null_on_second_vector)
117
 118 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 119 vpmovmskb %ymm1, %edx
 120 test %edx, %edx
 121 jnz L(exit_null_on_third_vector)
122
 123 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 124 vpmovmskb %ymm2, %edx
 125 test %edx, %edx
 126 jnz L(exit_null_on_fourth_vector)
127
 128 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 129 vpmovmskb %ymm3, %edx
 130 test %edx, %edx
 131 jnz L(exit_null_on_fifth_vector)
132
	/* Fourth unrolled round.  */
 133 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 134 add $(VEC_SIZE * 4), %rax
 135 vpmovmskb %ymm0, %edx
 136 test %edx, %edx
 137 jnz L(exit_null_on_second_vector)
138
 139 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 140 vpmovmskb %ymm1, %edx
 141 test %edx, %edx
 142 jnz L(exit_null_on_third_vector)
143
 144 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 145 vpmovmskb %ymm2, %edx
 146 test %edx, %edx
 147 jnz L(exit_null_on_fourth_vector)
148
 149 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 150 vpmovmskb %ymm3, %edx
 151 test %edx, %edx
 152 jnz L(exit_null_on_fifth_vector)
153
	/* Step one vector at a time (testing each before advancing)
	   until rax is 4 * VEC_SIZE aligned, then enter the 4-vector
	   main loop, which uses aligned loads.  */
 154 test $((VEC_SIZE * 4) - 1), %rax
 155 jz L(align_four_vec_loop)
156
 157 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
 158 add $(VEC_SIZE * 5), %rax
 159 vpmovmskb %ymm0, %edx
 160 test %edx, %edx
 161 jnz L(exit)
162
 163 test $((VEC_SIZE * 4) - 1), %rax
 164 jz L(align_four_vec_loop)
165
 166 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
 167 add $VEC_SIZE, %rax
 168 vpmovmskb %ymm1, %edx
 169 test %edx, %edx
 170 jnz L(exit)
171
 172 test $((VEC_SIZE * 4) - 1), %rax
 173 jz L(align_four_vec_loop)
174
 175 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
 176 add $VEC_SIZE, %rax
 177 vpmovmskb %ymm2, %edx
 178 test %edx, %edx
 179 jnz L(exit)
180
 181 test $((VEC_SIZE * 4) - 1), %rax
 182 jz L(align_four_vec_loop)
183
 184 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
 185 add $VEC_SIZE, %rax
 186 vpmovmskb %ymm3, %edx
 187 test %edx, %edx
 188 jnz L(exit)
189
	/* Skip past the vector just tested; rax is now 4 * VEC_SIZE
	   aligned at the start of unscanned data.  */
 190 add $VEC_SIZE, %rax
191
 192 .p2align 4
193L(align_four_vec_loop):
	/* Main loop: 4 * VEC_SIZE bytes per iteration with aligned
	   loads.  vpminub folds the four vectors into one; the unsigned
	   byte-minimum is zero iff at least one input byte was NUL, so
	   a single compare+movmsk tests all 128 bytes.  */
 194 vmovaps (%rax), %ymm4
 195 vpminub VEC_SIZE(%rax), %ymm4, %ymm4
 196 vmovaps (VEC_SIZE * 2)(%rax), %ymm5
 197 vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
 198 add $(VEC_SIZE * 4), %rax
 199 vpminub %ymm4, %ymm5, %ymm5
 200 vpcmpeqb %ymm5, %ymm6, %ymm5
 201 vpmovmskb %ymm5, %edx
 202 test %edx, %edx
 203 jz L(align_four_vec_loop)
204
	/* A NUL is somewhere in the last 4 vectors.  Re-test the first
	   of them, then rebase rax by -5 * VEC_SIZE so the four vectors
	   sit at offsets 1..4 * VEC_SIZE and the shared exit_null_on_*
	   labels below compute the right offset.  */
 205 vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
 206 sub $(VEC_SIZE * 5), %rax
 207 vpmovmskb %ymm0, %edx
 208 test %edx, %edx
 209 jnz L(exit_null_on_second_vector)
210
 211 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
 212 vpmovmskb %ymm1, %edx
 213 test %edx, %edx
 214 jnz L(exit_null_on_third_vector)
215
 216 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
 217 vpmovmskb %ymm2, %edx
 218 test %edx, %edx
 219 jnz L(exit_null_on_fourth_vector)
220
	/* Not in the first three, so the NUL must be in the fourth
	   vector; compute its offset from dst directly (no test/jnz
	   needed).  */
 221 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
 222 vpmovmskb %ymm3, %edx
 223 sub %rdi, %rax
 224 bsf %rdx, %rdx
 225 add %rdx, %rax
 226 add $(VEC_SIZE * 4), %rax
 227 jmp L(StartStrcpyPart)
228
 229 .p2align 4
	/* All exit paths compute rax = strlen (dst):
	   (tested-vector base - dst) + index of first NUL bit in edx
	   (+ the vector's offset from the base).  */
230L(exit):
	/* rax holds the address of the vector whose mask is in edx.  */
 231 sub %rdi, %rax
232L(exit_null_on_first_vector):
	/* rax already holds the base offset (0 on the unaligned first-
	   vector path, where eax was zeroed at entry).  */
 233 bsf %rdx, %rdx
 234 add %rdx, %rax
 235 jmp L(StartStrcpyPart)
236
 237 .p2align 4
238L(exit_null_on_second_vector):
	/* NUL was found at rax + VEC_SIZE.  */
 239 sub %rdi, %rax
 240 bsf %rdx, %rdx
 241 add %rdx, %rax
 242 add $VEC_SIZE, %rax
 243 jmp L(StartStrcpyPart)
244
 245 .p2align 4
246L(exit_null_on_third_vector):
	/* NUL was found at rax + 2 * VEC_SIZE.  */
 247 sub %rdi, %rax
 248 bsf %rdx, %rdx
 249 add %rdx, %rax
 250 add $(VEC_SIZE * 2), %rax
 251 jmp L(StartStrcpyPart)
252
 253 .p2align 4
254L(exit_null_on_fourth_vector):
	/* NUL was found at rax + 3 * VEC_SIZE.  */
 255 sub %rdi, %rax
 256 bsf %rdx, %rdx
 257 add %rdx, %rax
 258 add $(VEC_SIZE * 3), %rax
 259 jmp L(StartStrcpyPart)
260
 261 .p2align 4
262L(exit_null_on_fifth_vector):
	/* NUL was found at rax + 4 * VEC_SIZE; fall through.  */
 263 sub %rdi, %rax
 264 bsf %rdx, %rdx
 265 add %rdx, %rax
 266 add $(VEC_SIZE * 4), %rax
267
 268 .p2align 4
269L(StartStrcpyPart):
	/* Hand off to the strcpy part (strcpy-avx2.S with USE_AS_STRCAT):
	   rdi = &dst[strlen (dst)] (copy destination),
	   rcx = src (the strcpy code reads the source from rcx here),
	   rax = original dst (the value strcat must return).  */
 270 lea (%r9, %rax), %rdi
 271 mov %rsi, %rcx
 272 mov %r9, %rax /* save result */
273
274# ifdef USE_AS_STRNCAT
	/* strncat with maxlen == 0 copies nothing; L(ExitZero) is
	   defined in strcpy-avx2.S.  */
 275 test %r8, %r8
 276 jz L(ExitZero)
277# define USE_AS_STRNCPY
278# endif
279
	/* The included file implements the copy and emits END (STRCAT).  */
280# include "strcpy-avx2.S"
281#endif
282