1/* strcat with SSE2
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21/* Build this file when MINIMUM_X86_ISA_LEVEL <= 2: there is no separate
22 V2 implementation, so ISA V2 builds need this one as well. */
23#if ISA_SHOULD_BUILD (2)
24
25
26# include <sysdep.h>
27
/* Name of the entry point defined below.  The default is used only when
   nothing has defined STRCAT before this point.  */
28# ifndef STRCAT
29# define STRCAT __strcat_sse2_unaligned
30# endif
31
/* Defined ahead of the #include of strcpy-sse2-unaligned.S further down.
   NOTE(review): presumably that file tests this macro to adapt itself to
   being entered from the strcat prologue -- confirm there.  */
32# define USE_AS_STRCAT
33
/* STRCAT entry point.

   The code in this file locates the first zero byte of the string at
   %rdi (an inlined strlen), then falls into the code pulled in from
   strcpy-sse2-unaligned.S by the #include at the end of the block.

   Register usage visible in this file:
     %rdi   first argument; left untouched until L(StartStrcpyPart),
            where it is replaced by entry %rdi + offset of the zero byte.
     %rsi   copied to %rcx at L(StartStrcpyPart).
            NOTE(review): presumably the source string used by the
            included copy code -- confirm in strcpy-sse2-unaligned.S.
     %rdx   USE_AS_STRNCAT only: saved in %r8 on entry, since %edx is
            reused below as the byte-compare mask.
     %r9    copy of entry %rdi; moved to %rax as the result.
     %rax   while scanning: 0, or a 16-byte aligned scan pointer; the
            exit labels turn it into the byte offset of the zero byte
            relative to %rdi.
     %r10d  mask that discards bytes located before %rdi (L(next) only).
     %xmm0-%xmm3  all-zero comparands (see L(align16_start)).
     %xmm4, %xmm5 scratch in L(align64_loop).  */
34.text
35ENTRY (STRCAT)
36 mov %rdi, %r9
37# ifdef USE_AS_STRNCAT
38 mov %rdx, %r8
39# endif
40
41/* Inline corresponding strlen file, temporary until new strcpy
42 implementation gets merged. */
43
 /* %rax = 0 so that L(exit_less16) can add the bit index directly.
    %ecx = %rdi & 0x3f is the offset of %rdi inside its 64-byte block.
    When that offset is at most 0x30, the 16 bytes starting at %rdi end
    inside the same 64-byte block and are examined with one unaligned
    load; otherwise the aligned path at L(next) is used.
    NOTE(review): presumably this keeps the unaligned load from running
    into the following page -- confirm.  */
44 xor %rax, %rax
45 mov %edi, %ecx
46 and $0x3f, %ecx
47 pxor %xmm0, %xmm0
48 cmp $0x30, %ecx
49 ja L(next)
 /* Compare the first 16 bytes against zero; %edx gets one bit per byte,
    set where the byte is zero.  */
50 movdqu (%rdi), %xmm1
51 pcmpeqb %xmm1, %xmm0
52 pmovmskb %xmm0, %edx
53 test %edx, %edx
54 jnz L(exit_less16)
 /* No zero byte in those 16 bytes: continue with aligned blocks.
    %rax = %rdi rounded down to a multiple of 16.  */
55 mov %rdi, %rax
56 and $-16, %rax
57 jmp L(align16_start)
58L(next):
 /* Examine the aligned 16-byte block that contains %rdi and drop the
    mask bits of the bytes that precede %rdi.  The 32-bit shl uses only
    the low five bits of %cl; with %rcx = %rdi & 0x3f and %rax = %rdi
    rounded down to 16, that count equals %rdi & 15.  */
59 mov %rdi, %rax
60 and $-16, %rax
61 pcmpeqb (%rax), %xmm0
62 mov $-1, %r10d
63 sub %rax, %rcx
64 shl %cl, %r10d
65 pmovmskb %xmm0, %edx
66 and %r10d, %edx
67 jnz L(exit)
68
69L(align16_start):
 /* %xmm0-%xmm3 are zeroed once.  A pcmpeqb that finds no zero byte
    leaves its destination all-zero again, so each register can be
    reused as the zero comparand whenever the branch after it is not
    taken.

    First unrolled round: check the aligned blocks at offsets 16, 32, 48
    and 64 from %rax.  The L(exitNN) label matches the offset checked.  */
70 pxor %xmm0, %xmm0
71 pxor %xmm1, %xmm1
72 pxor %xmm2, %xmm2
73 pxor %xmm3, %xmm3
74 pcmpeqb 16(%rax), %xmm0
75 pmovmskb %xmm0, %edx
76 test %edx, %edx
77 jnz L(exit16)
78
79 pcmpeqb 32(%rax), %xmm1
80 pmovmskb %xmm1, %edx
81 test %edx, %edx
82 jnz L(exit32)
83
84 pcmpeqb 48(%rax), %xmm2
85 pmovmskb %xmm2, %edx
86 test %edx, %edx
87 jnz L(exit48)
88
89 pcmpeqb 64(%rax), %xmm3
90 pmovmskb %xmm3, %edx
91 test %edx, %edx
92 jnz L(exit64)
93
 /* Second round.  The block at 80(%rax) is compared before %rax moves
    forward by 64, so by the time of the branch it sits at offset 16,
    which is what L(exit16) assumes.  */
94 pcmpeqb 80(%rax), %xmm0
95 add $64, %rax
96 pmovmskb %xmm0, %edx
97 test %edx, %edx
98 jnz L(exit16)
99
100 pcmpeqb 32(%rax), %xmm1
101 pmovmskb %xmm1, %edx
102 test %edx, %edx
103 jnz L(exit32)
104
105 pcmpeqb 48(%rax), %xmm2
106 pmovmskb %xmm2, %edx
107 test %edx, %edx
108 jnz L(exit48)
109
110 pcmpeqb 64(%rax), %xmm3
111 pmovmskb %xmm3, %edx
112 test %edx, %edx
113 jnz L(exit64)
114
 /* Third round, same pattern as the second.  */
115 pcmpeqb 80(%rax), %xmm0
116 add $64, %rax
117 pmovmskb %xmm0, %edx
118 test %edx, %edx
119 jnz L(exit16)
120
121 pcmpeqb 32(%rax), %xmm1
122 pmovmskb %xmm1, %edx
123 test %edx, %edx
124 jnz L(exit32)
125
126 pcmpeqb 48(%rax), %xmm2
127 pmovmskb %xmm2, %edx
128 test %edx, %edx
129 jnz L(exit48)
130
131 pcmpeqb 64(%rax), %xmm3
132 pmovmskb %xmm3, %edx
133 test %edx, %edx
134 jnz L(exit64)
135
 /* Fourth round, same pattern as the second.  */
136 pcmpeqb 80(%rax), %xmm0
137 add $64, %rax
138 pmovmskb %xmm0, %edx
139 test %edx, %edx
140 jnz L(exit16)
141
142 pcmpeqb 32(%rax), %xmm1
143 pmovmskb %xmm1, %edx
144 test %edx, %edx
145 jnz L(exit32)
146
147 pcmpeqb 48(%rax), %xmm2
148 pmovmskb %xmm2, %edx
149 test %edx, %edx
150 jnz L(exit48)
151
152 pcmpeqb 64(%rax), %xmm3
153 pmovmskb %xmm3, %edx
154 test %edx, %edx
155 jnz L(exit64)
156
 /* Sixteen aligned blocks have been checked without finding a zero
    byte.  Bring %rax to a multiple of 64 for the main loop, which reads
    64 bytes per iteration starting at (%rax).  If %rax is already a
    multiple of 64, enter the loop directly.  */
157 test $0x3f, %rax
158 jz L(align64_loop)
159
 /* Otherwise advance one 16-byte block at a time.  Each step compares
    the next unchecked block and moves %rax onto that block, so a hit
    leaves through L(exit), which applies no additional offset.  */
160 pcmpeqb 80(%rax), %xmm0
161 add $80, %rax
162 pmovmskb %xmm0, %edx
163 test %edx, %edx
164 jnz L(exit)
165
166 test $0x3f, %rax
167 jz L(align64_loop)
168
169 pcmpeqb 16(%rax), %xmm1
170 add $16, %rax
171 pmovmskb %xmm1, %edx
172 test %edx, %edx
173 jnz L(exit)
174
175 test $0x3f, %rax
176 jz L(align64_loop)
177
178 pcmpeqb 16(%rax), %xmm2
179 add $16, %rax
180 pmovmskb %xmm2, %edx
181 test %edx, %edx
182 jnz L(exit)
183
184 test $0x3f, %rax
185 jz L(align64_loop)
186
 /* NOTE(review): %rax is a multiple of 16 on every path reaching the
    first alignment test above, so the add $80 / add $16 / add $16
    sequence has made it a multiple of 64 by the test just before this
    comment at the latest.  This last step and the add $16 that follows
    it therefore look unreachable -- confirm before relying on or
    removing them.  */
187 pcmpeqb 16(%rax), %xmm3
188 add $16, %rax
189 pmovmskb %xmm3, %edx
190 test %edx, %edx
191 jnz L(exit)
192
193 add $16, %rax
194 .p2align 4
 /* Main loop: 64 bytes per iteration using aligned loads.  The bytewise
    unsigned minimum of the four 16-byte blocks contains a zero byte
    exactly when one of the blocks does, so a single compare against
    %xmm0 (still all-zero) covers all 64 bytes.  %rax is advanced before
    the test; on loop exit it points 64 bytes past the start of the
    group that holds the zero byte.  */
195 L(align64_loop):
196 movaps (%rax), %xmm4
197 pminub 16(%rax), %xmm4
198 movaps 32(%rax), %xmm5
199 pminub 48(%rax), %xmm5
200 add $64, %rax
201 pminub %xmm4, %xmm5
202 pcmpeqb %xmm0, %xmm5
203 pmovmskb %xmm5, %edx
204 test %edx, %edx
205 jz L(align64_loop)
206
 /* Find which of the four blocks holds the zero byte.  The first block
    is compared at -64(%rax); %rax is then moved back by 80, i.e. to 16
    bytes before the group, so the four blocks are at offsets 16, 32, 48
    and 64 and the regular L(exit16)/L(exit32)/L(exit48) labels apply.  */
207 pcmpeqb -64(%rax), %xmm0
208 sub $80, %rax
209 pmovmskb %xmm0, %edx
210 test %edx, %edx
211 jnz L(exit16)
212
213 pcmpeqb 32(%rax), %xmm1
214 pmovmskb %xmm1, %edx
215 test %edx, %edx
216 jnz L(exit32)
217
218 pcmpeqb 48(%rax), %xmm2
219 pmovmskb %xmm2, %edx
220 test %edx, %edx
221 jnz L(exit48)
222
 /* The first three blocks had no zero byte, so it is in the fourth; no
    test is made.  The offset is computed inline using the same
    arithmetic as L(exit64).  */
223 pcmpeqb 64(%rax), %xmm3
224 pmovmskb %xmm3, %edx
225 sub %rdi, %rax
226 bsf %rdx, %rdx
227 add %rdx, %rax
228 add $64, %rax
229 jmp L(StartStrcpyPart)
230
 /* Exit paths.  Each one leaves in %rax the offset of the zero byte from
    %rdi: (block base - %rdi) + index of the lowest set mask bit + the
    block's offset from the base.

    L(exit): the examined block starts at %rax itself.
    L(exit_less16): %rax is already 0 and the block started at %rdi.  */
231 .p2align 4
232L(exit):
233 sub %rdi, %rax
234L(exit_less16):
235 bsf %rdx, %rdx
236 add %rdx, %rax
237 jmp L(StartStrcpyPart)
238
 /* The examined block is at 16(%rax).  */
239 .p2align 4
240L(exit16):
241 sub %rdi, %rax
242 bsf %rdx, %rdx
243 add %rdx, %rax
244 add $16, %rax
245 jmp L(StartStrcpyPart)
246
 /* The examined block is at 32(%rax).  */
247 .p2align 4
248L(exit32):
249 sub %rdi, %rax
250 bsf %rdx, %rdx
251 add %rdx, %rax
252 add $32, %rax
253 jmp L(StartStrcpyPart)
254
 /* The examined block is at 48(%rax).  */
255 .p2align 4
256L(exit48):
257 sub %rdi, %rax
258 bsf %rdx, %rdx
259 add %rdx, %rax
260 add $48, %rax
261 jmp L(StartStrcpyPart)
262
 /* The examined block is at 64(%rax).  There is no jump here: execution
    falls through to L(StartStrcpyPart).  */
263 .p2align 4
264L(exit64):
265 sub %rdi, %rax
266 bsf %rdx, %rdx
267 add %rdx, %rax
268 add $64, %rax
269
 /* Hand-over to the copy code.  %rdi now points at the zero byte found
    above (entry %rdi, kept in %r9, plus the offset in %rax), %rcx gets a
    copy of %rsi, and %rax is loaded with the entry %rdi.  */
270 .p2align 4
271L(StartStrcpyPart):
272 lea (%r9, %rax), %rdi
273 mov %rsi, %rcx
274 mov %r9, %rax /* save result */
275
 /* USE_AS_STRNCAT: a saved third argument of zero branches to
    L(ExitZero), which is not defined in this file.
    NOTE(review): presumably provided by the included
    strcpy-sse2-unaligned.S, which USE_AS_STRNCPY also configures --
    confirm there.  */
276# ifdef USE_AS_STRNCAT
277 test %r8, %r8
278 jz L(ExitZero)
279# define USE_AS_STRNCPY
280# endif
281
 /* The remainder of the function comes from the included file.  */
282# include "strcpy-sse2-unaligned.S"
283#endif
284