/* strcat with SSE2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_sse2_unaligned
# endif

/* Defined before strcpy-sse2-unaligned.S is included at the end of this
   file; presumably that file tests it to know it is being used as the
   tail of strcat -- confirm there.  */
# define USE_AS_STRCAT

/* STRCAT (dst, src); with USE_AS_STRNCAT there is a third argument n.

   Syntax: AT&T.  ABI: System V AMD64.

   In:
     rdi = dst, rsi = src, rdx = n (USE_AS_STRNCAT only).

   Register roles in this file:
     r9   original dst; copied into rax as the return value
     r8   n (USE_AS_STRNCAT only)
     rax  scan position while searching, then strlen (dst)
     rcx, rdx, r10, xmm0-xmm5  scratch

   This file only locates the terminating NUL of dst (an inlined
   strlen).  At L(StartStrcpyPart) it sets
     rdi = dst + strlen (dst), rcx = src, rax = dst
   and execution continues in the code pulled in from
   strcpy-sse2-unaligned.S.  No END for this ENTRY is visible here, so
   the included file is expected to provide it.  */

	.text
ENTRY (STRCAT)
	mov	%rdi, %r9		/* r9 = dst, kept for the return value.  */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8		/* r8 = n.  */
# endif

/* Inline corresponding strlen file, temporary until new strcpy
   implementation gets merged.  */

	/* A 32-bit xor clears all of rax and has a shorter encoding than
	   the 64-bit form.  rax stays 0 on the L(exit_less16) path.  */
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = offset of dst in its 64-byte block.  */
	pxor	%xmm0, %xmm0
	/* With an offset of at most 0x30 the 16 bytes at dst end no later
	   than offset 63, so the unaligned load stays inside that 64-byte
	   block.  Otherwise use an aligned load and mask (L(next)).  */
	cmp	$0x30, %ecx
	ja	L(next)
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every byte that is NUL.  */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit_less16)		/* NUL within the first 16 bytes.  */
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = dst rounded down to 16.  */
	jmp	L(align16_start)
L(next):
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = dst rounded down to 16.  */
	pcmpeqb	(%rax), %xmm0
	mov	$-1, %r10d
	/* Only cl is used, and a 32-bit shl takes the count modulo 32.
	   rcx is (dst & 63) with dst in the top quarter of its 64-byte
	   block and rax is that block base + 48, so the low five bits of
	   the difference equal dst & 15.  */
	sub	%rax, %rcx
	shl	%cl, %r10d		/* Clear mask bits for bytes before dst.  */
	pmovmskb %xmm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

	/* Here rax is 16-byte aligned and the chunk at (rax) holds no NUL
	   at or after dst.  Four unrolled rounds follow; each one tests
	   the chunks at 16, 32, 48 and 64 bytes past rax and then moves
	   rax forward by 64.  Every compare that finds nothing leaves its
	   xmm register all-zero, ready to be reused as the zero operand.  */
L(align16_start):
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqb	16(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 2: the chunk loaded as 80(rax) is 16(rax) after the add.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 3.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 4.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Everything through 64(rax) + 15 is NUL-free.  Step one chunk at
	   a time until rax is 64-byte aligned, then enter the loop.  From
	   here on rax points at the chunk just tested, so a hit leaves
	   through L(exit), which adds no chunk offset.  The loop starts
	   reading at (rax), so it re-reads bytes already known to be
	   NUL-free; that costs a little time but cannot change the
	   result.  */
	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	/* NOTE(review): rax is 16-byte aligned and was not 64-byte aligned
	   at the first test above, so after the +80, +16, +16 steps it has
	   reached a multiple of 64 by the preceding test at the latest.
	   This fourth step therefore looks unreachable.  It is kept
	   unchanged.  */
	pcmpeqb	16(%rax), %xmm3
	add	$16, %rax
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$16, %rax

	/* Main loop, 64 bytes per iteration, rax 64-byte aligned (movaps
	   requires 16).  The unsigned byte-wise minimum of the four chunks
	   has a zero byte exactly when one of the chunks has one.  xmm0 is
	   all-zero here because no earlier compare into it found a
	   match.  */
	.p2align 4
L(align64_loop):
	movaps	(%rax), %xmm4
	pminub	16(%rax), %xmm4
	movaps	32(%rax), %xmm5
	pminub	48(%rax), %xmm5
	add	$64, %rax
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %edx
	test	%edx, %edx
	jz	L(align64_loop)

	/* The 64 bytes ending at rax contain a NUL; find which chunk.
	   Moving rax back by 80 puts the four chunks at 16, 32, 48 and
	   64(rax) so that the shared exit labels can be reused.  xmm1-xmm3
	   are still all-zero for the same reason as xmm0.  */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* The first three chunks were clean, so the NUL is in the fourth
	   and edx is non-zero; no test is needed.  */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	jmp	L(StartStrcpyPart)

	/* Exit paths.  On entry rax is the base the failing chunk was
	   addressed from and edx is its NUL mask; the upper half of rdx is
	   zero because edx was written by a 32-bit operation.  Each path
	   computes
	     rax = (rax - dst) + index of lowest set bit + chunk offset
	   which is strlen (dst).  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
	/* L(exit_less16) is entered directly from the unaligned first
	   check with rax still 0, so the bit index alone is the length.  */
L(exit_less16):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit16):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$16, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit32):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$32, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit48):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$48, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit64):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	/* Falls through, across the alignment padding, into
	   L(StartStrcpyPart).  */

	/* rax = strlen (dst), r9 = dst.  Set up the registers for the copy
	   code that follows.  */
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi	/* rdi = address of the NUL ending dst.  */
	mov	%rsi, %rcx		/* rcx = src.  */
	mov	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	/* n == 0: branch to L(ExitZero), which is not defined in this
	   file (presumably in the file included below).  */
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-sse2-unaligned.S"
#endif
279