/* strcat with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* zero register */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

	.section .text.evex,"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

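	/* %ecx = offset of the destination within a (VEC_SIZE * 4)
	   block.  If that offset is above (VEC_SIZE * 3), an unaligned
	   VEC_SIZE load from %rdi could cross into the next block
	   (and possibly a page boundary), so take the aligned path
	   below instead.  */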
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpb	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
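	/* The destination starts within VEC_SIZE bytes of the next
	   (VEC_SIZE * 4) boundary.  Compare the containing aligned
	   vector and mask off the match bits that precede the start
	   of the string (%cl holds %rdi's offset modulo VEC_SIZE).  */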
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpb	$0, (%rax), %YMMZERO, %k0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	kmovd	%k0, %edx
	and	%r10d, %edx
	jnz	L(exit)

L(align_vec_size_start):
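	/* %rax is VEC_SIZE aligned.  Scan forward one vector at a
	   time, unrolled four per group, exiting as soon as a null
	   byte is found.  */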
	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	kmovd	%k4, %edx
	add	$(VEC_SIZE * 4), %rax
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

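	/* Keep checking one vector at a time until %rax reaches a
	   (VEC_SIZE * 4) boundary, then enter the main loop.  */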
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 5), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
	add	$VEC_SIZE, %rax
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

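	/* Main loop: check four vectors per iteration.  vpminub folds
	   the four vectors together, so a single compare against the
	   zero register detects a null byte in any of them.  */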
	.p2align 4
L(align_four_vec_loop):
	VMOVA	(%rax), %YMM0
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
	vpminub	%YMM0, %YMM1, %YMM0
	/* If K0 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM0, %YMMZERO, %k0
	add	$(VEC_SIZE * 4), %rax
	ktestd	%k0, %k0
	jz	L(align_four_vec_loop)

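	/* A null byte lies in one of the last four vectors.  %rax has
	   already been advanced past them, so rewind and recheck each
	   vector to locate it.  */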
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
	sub	$(VEC_SIZE * 5), %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

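	/* Each exit label computes the destination string length in
	   %rax: the byte offset of the matching vector from %rdi plus
	   the bit index (from bsf) of the first null byte in %rdx.  */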
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

	.p2align 4
L(StartStrcpyPart):
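	/* Set up for the copy: %rdi = the null terminator at the end
	   of the destination string, %rcx = source, %rax = original
	   destination (the return value).  */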
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

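	/* strcpy-evex.S performs the actual copy.  With USE_AS_STRCAT
	   defined it skips its own entry and argument setup, relying
	   on the registers prepared above.  */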
# include "strcpy-evex.S"
#endif