1/* SSE2 version of strlen and SSE4.1 version of wcslen.
2 Copyright (C) 2012-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
#ifdef AS_WCSLEN
/* wcslen: elements are 32-bit wchar_t, so use dword variants.
   pminud is SSE4.1 (pminub below is plain SSE2), which is why the
   wcslen build of this file requires SSE4.1.  */
# define PMINU pminud
# define PCMPEQ pcmpeqd
/* The scan computes a byte count in %rax; divide by
   sizeof (wchar_t) == 4 to return an element count.  */
# define SHIFT_RETURN shrq $2, %rax
#else
/* strlen: elements are single bytes; the byte count is the result, so
   SHIFT_RETURN expands to nothing.  */
# define PMINU pminub
# define PCMPEQ pcmpeqb
# define SHIFT_RETURN
#endif
30
/* Long-lived registers in strlen (s) and strnlen (s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s + n) & ~(64 - 1)
	%r11  - s + n
*/
38
39
.text
ENTRY(strlen)

/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx:
   bit i of %rdx is set iff byte i of the 64-byte window at %rax belongs
   to a zero element.  Requires %xmm0-%xmm3 to be all-zero on entry and
   clobbers them (a register that matched holds the comparison result
   afterwards); also clobbers %rsi, %rcx and %r8.  The four 16-bit
   pmovmskb masks are merged pairwise and then combined into one 64-bit
   mask.  */
#define FIND_ZERO	\
	PCMPEQ (%rax), %xmm0;	\
	PCMPEQ 16(%rax), %xmm1;	\
	PCMPEQ 32(%rax), %xmm2;	\
	PCMPEQ 48(%rax), %xmm3;	\
	pmovmskb %xmm0, %esi;	\
	pmovmskb %xmm1, %edx;	\
	pmovmskb %xmm2, %r8d;	\
	pmovmskb %xmm3, %ecx;	\
	salq $16, %rdx;	\
	salq $16, %rcx;	\
	orq %rsi, %rdx;	\
	orq %r8, %rcx;	\
	salq $32, %rcx;	\
	orq %rcx, %rdx;

#ifdef AS_STRNLEN
/* Do not read anything when n==0: strnlen (s, 0) must not touch s.  */
	test %RSI_LP, %RSI_LP
	jne L(n_nonzero)
	xor %rax, %rax
	ret
L(n_nonzero):
# ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
   overflow the only way this program doesn't have undefined behavior
   is if there is a null terminator in valid memory so wcslen will
   suffice.  (sar $62 leaves a nonzero value exactly when maxlen >= 2^62,
   i.e. when the sal $2 below would overflow.)  */
	mov %RSI_LP, %R10_LP
	sar $62, %R10_LP
	jnz __wcslen_sse4_1
	sal $2, %RSI_LP		/* Convert maxlen from wchar_t units to bytes.  */
# endif

/* Initialize long lived registers: %r11 = s + n (exact end),
   %r10 = end rounded down to a 64-byte block (main-loop stop).  */
	add %RDI_LP, %RSI_LP
	mov %RSI_LP, %R10_LP
	and $-64, %R10_LP
	mov %RSI_LP, %R11_LP
#endif

/* FIND_ZERO needs %xmm0-%xmm3 zeroed; %xmm3 additionally stays zero for
   the PCMPEQ in the main loops below.  */
	pxor %xmm0, %xmm0
	pxor %xmm1, %xmm1
	pxor %xmm2, %xmm2
	pxor %xmm3, %xmm3
	movq %rdi, %rax
	movq %rdi, %rcx
	andq $4095, %rcx	/* %rcx = offset of s within its 4 KiB page.  */
/* Offsets 4032-4047 will be aligned into 4032 thus fit into page, so an
   unaligned 16-byte load at offset <= 4047 cannot fault past the page.  */
	cmpq $4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja L(cross_page)

#ifdef AS_STRNLEN
/* Test if end is among first 64 bytes: %rsi = bytes from the (soon
   64-byte-aligned) start to the exact end; if that is < 64, handle the
   block in L(strnlen_ret) with an end sentinel instead of entering the
   loop.  */
# define STRNLEN_PROLOG	\
	mov %r11, %rsi;	\
	subq %rax, %rsi;	\
	andq $-64, %rax;	\
	testq $-64, %rsi;	\
	je L(strnlen_ret)
#else
# define STRNLEN_PROLOG  andq $-64, %rax;
#endif

/* Ignore bits in mask that come before start of string.  %rax is 64-byte
   aligned here (or becomes so inside STRNLEN_PROLOG), so the low 6 bits
   of %rdi ^ %rax equal s - aligned_start; sarq uses only %cl's low 6
   bits, shifting those leading bits out of the mask.  If a zero remains,
   its bsf index is already the length; otherwise fall through to lab.
   Note %rcx stays live into L(strnlen_ret).  */
#define PROLOG(lab)	\
	movq %rdi, %rcx;	\
	xorq %rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq %cl, %rdx;	\
	test %rdx, %rdx;	\
	je L(lab);	\
	bsfq %rdx, %rax;	\
	SHIFT_RETURN;	\
	ret

#ifdef AS_STRNLEN
/* Align down to 16 and scan the whole 64-byte window; safe because the
   page-offset check above guarantees the reads stay within the page.  */
	andq $-16, %rax
	FIND_ZERO
#else
	/* Test first 16 bytes unaligned.  */
	movdqu (%rax), %xmm4
	PCMPEQ %xmm0, %xmm4
	pmovmskb %xmm4, %edx
	test %edx, %edx
	je L(next48_bytes)
	bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes (already
   known zero-free), so %xmm0/%rsi are not needed in the merge.  */
	andq $-16, %rax
	PCMPEQ 16(%rax), %xmm1
	PCMPEQ 32(%rax), %xmm2
	PCMPEQ 48(%rax), %xmm3
	pmovmskb %xmm1, %edx
	pmovmskb %xmm2, %r8d
	pmovmskb %xmm3, %ecx
	salq $16, %rdx
	salq $16, %rcx
	orq %r8, %rcx
	salq $32, %rcx
	orq %rcx, %rdx
#endif

	/* When no zero byte is found xmm1-3 are zero so we do not have to
	   zero them for the main loop.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
	/* Near a page end: fall back to 64-byte-aligned loads, which can
	   never cross a page boundary.  */
	andq $-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

#ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).
   End lies inside this first 64-byte block: plant a sentinel "zero" bit
   at the end offset (bts uses %rsi mod 64) so bsf cannot report a
   position past maxlen, then drop pre-string bits as in PROLOG (%rcx
   still holds the PROLOG shift count).  */
L(strnlen_ret):
	bts %rsi, %rdx
	sarq %cl, %rdx
	test %rdx, %rdx
	je L(loop_init)
	bsfq %rdx, %rax
	SHIFT_RETURN
	ret
#endif
	.p2align 4
L(loop_init):
	/* Re-zero the registers FIND_ZERO may have overwritten; the loop
	   uses %xmm3 as the zero operand of PCMPEQ.  */
	pxor %xmm1, %xmm1
	pxor %xmm2, %xmm2
	pxor %xmm3, %xmm3
#ifdef AS_STRNLEN
	.p2align 4
L(loop):
	/* Stop at %r10 = (s + n) & -64 before reading, so we never load
	   past the last full 64-byte block below maxlen.  */
	addq $64, %rax
	cmpq %rax, %r10
	je L(exit_end)

	/* Unsigned min of the four 16-byte chunks has a zero element iff
	   any chunk does, so one PCMPEQ/pmovmskb tests all 64 bytes.  */
	movdqa (%rax), %xmm0
	PMINU 16(%rax), %xmm0
	PMINU 32(%rax), %xmm0
	PMINU 48(%rax), %xmm0
	PCMPEQ %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	testl %edx, %edx
	jne L(exit)
	jmp L(loop)

	.p2align 4
L(exit_end):
	cmp %rax, %r11
	je L(first) /* Do not read when end is at page boundary.  */
	pxor %xmm0, %xmm0
	FIND_ZERO

L(first):
	/* Sentinel bit at the end offset (%r11 mod 64) caps the result at
	   maxlen even when no NUL was found in the final block.  */
	bts %r11, %rdx
	bsfq %rdx, %rdx
	addq %rdx, %rax
	subq %rdi, %rax		/* Length = zero position - s.  */
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
	/* A zero was seen in this block; rescan it precisely to build the
	   per-byte mask, then convert bit position to length.  */
	pxor %xmm0, %xmm0
	FIND_ZERO

	bsfq %rdx, %rdx
	addq %rdx, %rax
	subq %rdi, %rax
	SHIFT_RETURN
	ret

#else

	/* Main loop.  Unrolled twice to improve L2 cache performance on
	   core2.  Each half tests 64 bytes via the PMINU reduction (the
	   unsigned min is zero iff some chunk contains a zero element).  */
	.p2align 4
L(loop):

	movdqa 64(%rax), %xmm0
	PMINU 80(%rax), %xmm0
	PMINU 96(%rax), %xmm0
	PMINU 112(%rax), %xmm0
	PCMPEQ %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	testl %edx, %edx
	jne L(exit64)

	subq $-128, %rax	/* += 128; -128 encodes as sign-extended imm8.  */

	movdqa (%rax), %xmm0
	PMINU 16(%rax), %xmm0
	PMINU 32(%rax), %xmm0
	PMINU 48(%rax), %xmm0
	PCMPEQ %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	testl %edx, %edx
	jne L(exit0)
	jmp L(loop)

	.p2align 4
L(exit64):
	addq $64, %rax		/* The hit was in the second 64-byte half.  */
L(exit0):
	/* Rescan the 64-byte block to get the exact per-byte mask, then
	   length = zero position - s.  */
	pxor %xmm0, %xmm0
	FIND_ZERO

	bsfq %rdx, %rdx
	addq %rdx, %rax
	subq %rdi, %rax
	SHIFT_RETURN
	ret

#endif

END(strlen)
264