/* strcat with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* zero register */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

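/* Overview: the code below first scans the destination for its
   terminating null byte, leaving the null's offset from %rdi in %rax,
   then jumps to L(StartStrcpyPart), which appends the source string by
   falling into the strcpy-evex code included at the bottom of this
   file.  */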
	.section .text.evex,"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

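	/* %r9 preserves the destination for the return value.  Compute
	   the misalignment of %rdi within a 4 * VEC_SIZE block to choose
	   the strategy for the first vector.  */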
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpb	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
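	/* %rdi is within VEC_SIZE bytes of the next 4 * VEC_SIZE
	   boundary, so an unaligned VEC_SIZE load from %rdi could cross
	   into an unmapped page.  Use an aligned load instead and mask
	   off the compare bits for bytes before the start of the
	   string.  */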
L(fourth_vector_boundary):
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpb	$0, (%rax), %YMMZERO, %k0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	kmovd	%k0, %edx
	and	%r10d, %edx
	jnz	L(exit)

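	/* Scan one vector at a time, unrolled four deep, until %rax
	   reaches a 4 * VEC_SIZE boundary so the main loop can use
	   aligned loads.  */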
L(align_vec_size_start):
	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	kmovd	%k4, %edx
	add	$(VEC_SIZE * 4), %rax
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

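	/* %rax is 4 * VEC_SIZE aligned once its low bits are all zero;
	   keep probing one vector at a time until that holds.  */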
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 5), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
	add	$VEC_SIZE, %rax
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

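	/* Main loop: check 4 * VEC_SIZE bytes per iteration.  vpminub
	   folds the four vectors into one whose bytes are zero exactly
	   where one of the four had a null byte.  */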
	.p2align 4
L(align_four_vec_loop):
	VMOVA	(%rax), %YMM0
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
	vpminub	%YMM0, %YMM1, %YMM0
	/* If K0 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM0, %YMMZERO, %k0
	add	$(VEC_SIZE * 4), %rax
	ktestd	%k0, %k0
	jz	L(align_four_vec_loop)

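	/* A null byte lies somewhere in the last 4 * VEC_SIZE bytes;
	   recheck each of the four vectors to locate it.  */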
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
	sub	$(VEC_SIZE * 5), %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

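	/* Each exit label turns %rax into the offset of the null byte
	   from %rdi: subtract the original pointer, add the bit index of
	   the null within its vector (bsf), then add that vector's
	   offset.  */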
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

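	/* %rax now holds the length of the destination string.  Point
	   %rdi at its null terminator and let the included strcpy-evex
	   code perform the copy; the original destination stays in %r9
	   as the return value.  */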
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

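	/* For strncat, exit early when the length bound is zero and make
	   the included strcpy code honor the bound.  */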
# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-evex.S"
#endif