/* Optimized wcslen for x86-64 with SSE2.
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */
18
#include <sysdep.h>
20
21 .text
22ENTRY (__wcslen)
23 cmpl $0, (%rdi)
24 jz L(exit_tail0)
25 cmpl $0, 4(%rdi)
26 jz L(exit_tail1)
27 cmpl $0, 8(%rdi)
28 jz L(exit_tail2)
29 cmpl $0, 12(%rdi)
30 jz L(exit_tail3)
31 cmpl $0, 16(%rdi)
32 jz L(exit_tail4)
33 cmpl $0, 20(%rdi)
34 jz L(exit_tail5)
35 cmpl $0, 24(%rdi)
36 jz L(exit_tail6)
37 cmpl $0, 28(%rdi)
38 jz L(exit_tail7)
39
40 pxor %xmm0, %xmm0
41
42 lea 32(%rdi), %rax
43 lea 16(%rdi), %rcx
44 and $-16, %rax
45
46 pcmpeqd (%rax), %xmm0
47 pmovmskb %xmm0, %edx
48 pxor %xmm1, %xmm1
49 test %edx, %edx
50 lea 16(%rax), %rax
51 jnz L(exit)
52
53 pcmpeqd (%rax), %xmm1
54 pmovmskb %xmm1, %edx
55 pxor %xmm2, %xmm2
56 test %edx, %edx
57 lea 16(%rax), %rax
58 jnz L(exit)
59
60 pcmpeqd (%rax), %xmm2
61 pmovmskb %xmm2, %edx
62 pxor %xmm3, %xmm3
63 test %edx, %edx
64 lea 16(%rax), %rax
65 jnz L(exit)
66
67 pcmpeqd (%rax), %xmm3
68 pmovmskb %xmm3, %edx
69 test %edx, %edx
70 lea 16(%rax), %rax
71 jnz L(exit)
72
73 pcmpeqd (%rax), %xmm0
74 pmovmskb %xmm0, %edx
75 test %edx, %edx
76 lea 16(%rax), %rax
77 jnz L(exit)
78
79 pcmpeqd (%rax), %xmm1
80 pmovmskb %xmm1, %edx
81 test %edx, %edx
82 lea 16(%rax), %rax
83 jnz L(exit)
84
85 pcmpeqd (%rax), %xmm2
86 pmovmskb %xmm2, %edx
87 test %edx, %edx
88 lea 16(%rax), %rax
89 jnz L(exit)
90
91 pcmpeqd (%rax), %xmm3
92 pmovmskb %xmm3, %edx
93 test %edx, %edx
94 lea 16(%rax), %rax
95 jnz L(exit)
96
97 pcmpeqd (%rax), %xmm0
98 pmovmskb %xmm0, %edx
99 test %edx, %edx
100 lea 16(%rax), %rax
101 jnz L(exit)
102
103 pcmpeqd (%rax), %xmm1
104 pmovmskb %xmm1, %edx
105 test %edx, %edx
106 lea 16(%rax), %rax
107 jnz L(exit)
108
109 pcmpeqd (%rax), %xmm2
110 pmovmskb %xmm2, %edx
111 test %edx, %edx
112 lea 16(%rax), %rax
113 jnz L(exit)
114
115 pcmpeqd (%rax), %xmm3
116 pmovmskb %xmm3, %edx
117 test %edx, %edx
118 lea 16(%rax), %rax
119 jnz L(exit)
120
121 and $-0x40, %rax
122
123 .p2align 4
124L(aligned_64_loop):
125 movaps (%rax), %xmm0
126 movaps 16(%rax), %xmm1
127 movaps 32(%rax), %xmm2
128 movaps 48(%rax), %xmm6
129
130 pminub %xmm1, %xmm0
131 pminub %xmm6, %xmm2
132 pminub %xmm0, %xmm2
133 pcmpeqd %xmm3, %xmm2
134 pmovmskb %xmm2, %edx
135 test %edx, %edx
136 lea 64(%rax), %rax
137 jz L(aligned_64_loop)
138
139 pcmpeqd -64(%rax), %xmm3
140 pmovmskb %xmm3, %edx
141 test %edx, %edx
142 lea 48(%rcx), %rcx
143 jnz L(exit)
144
145 pcmpeqd %xmm1, %xmm3
146 pmovmskb %xmm3, %edx
147 test %edx, %edx
148 lea -16(%rcx), %rcx
149 jnz L(exit)
150
151 pcmpeqd -32(%rax), %xmm3
152 pmovmskb %xmm3, %edx
153 test %edx, %edx
154 lea -16(%rcx), %rcx
155 jnz L(exit)
156
157 pcmpeqd %xmm6, %xmm3
158 pmovmskb %xmm3, %edx
159 test %edx, %edx
160 lea -16(%rcx), %rcx
161 jnz L(exit)
162
163 jmp L(aligned_64_loop)
164
165 .p2align 4
166L(exit):
167 sub %rcx, %rax
168 shr $2, %rax
169 test %dl, %dl
170 jz L(exit_high)
171
172 mov %dl, %cl
173 and $15, %cl
174 jz L(exit_1)
175 ret
176
177 .p2align 4
178L(exit_high):
179 mov %dh, %ch
180 and $15, %ch
181 jz L(exit_3)
182 add $2, %rax
183 ret
184
185 .p2align 4
186L(exit_1):
187 add $1, %rax
188 ret
189
190 .p2align 4
191L(exit_3):
192 add $3, %rax
193 ret
194
195 .p2align 4
196L(exit_tail0):
197 xor %rax, %rax
198 ret
199
200 .p2align 4
201L(exit_tail1):
202 mov $1, %rax
203 ret
204
205 .p2align 4
206L(exit_tail2):
207 mov $2, %rax
208 ret
209
210 .p2align 4
211L(exit_tail3):
212 mov $3, %rax
213 ret
214
215 .p2align 4
216L(exit_tail4):
217 mov $4, %rax
218 ret
219
220 .p2align 4
221L(exit_tail5):
222 mov $5, %rax
223 ret
224
225 .p2align 4
226L(exit_tail6):
227 mov $6, %rax
228 ret
229
230 .p2align 4
231L(exit_tail7):
232 mov $7, %rax
233 ret
234
235END (__wcslen)
236
237weak_alias(__wcslen, wcslen)
238