1 | /* Optimized wcslen for x86-64 with SSE2. |
2 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | .text |
/* __wcslen -- length of a zero-terminated string of 4-byte elements.

   C equivalent (per the file header): size_t wcslen (const wchar_t *s),
   with a 4-byte wchar_t.

   In:    %rdi = s
   Out:   %rax = number of 4-byte elements before the first zero element
   Uses:  %rax, %rcx, %rdx, %xmm0-%xmm3, %xmm6, flags.  No stack usage,
	  no calls.

   Register roles once the vector code is entered:
     %rax   cursor into the string, always 16-byte aligned
     %rcx   bias chosen so that, at L(exit), %rax - %rcx is the byte
	    offset from s of the 16-byte vector that holds the terminator
     %edx   pmovmskb mask of the last compare (4 mask bits per element)
     %xmm0-%xmm3  compare accumulators; each is all-zero whenever it is
	    used as the "zero" operand of pcmpeqd, because a compare that
	    finds no zero element leaves all-zero in the register.

   Assumption (not checked here): s is 4-byte aligned, so that the
   elements line up with the dword lanes of the aligned vector loads.  */

ENTRY (__wcslen)
	/* Scalar check of elements 0..7.  Each load is only issued after
	   all previous elements were found to be non-zero.  */
	cmpl	$0, (%rdi)
	jz	L(exit_tail0)
	cmpl	$0, 4(%rdi)
	jz	L(exit_tail1)
	cmpl	$0, 8(%rdi)
	jz	L(exit_tail2)
	cmpl	$0, 12(%rdi)
	jz	L(exit_tail3)
	cmpl	$0, 16(%rdi)
	jz	L(exit_tail4)
	cmpl	$0, 20(%rdi)
	jz	L(exit_tail5)
	cmpl	$0, 24(%rdi)
	jz	L(exit_tail6)
	cmpl	$0, 28(%rdi)
	jz	L(exit_tail7)

	pxor	%xmm0, %xmm0

	/* %rax = (s + 32) rounded down to 16, i.e. somewhere in
	   (s + 16, s + 32]; everything below s + 32 was checked above.
	   %rcx = s + 16 compensates for %rax having been advanced one
	   vector past the match by the time L(exit) is reached.  */
	lea	32(%rdi), %rax
	lea	16(%rdi), %rcx
	and	$-16, %rax

	/* Twelve unrolled single-vector checks.  In every step the lea
	   sits between test and jnz on purpose: lea does not modify the
	   flags, so jnz still sees the result of test.  The first three
	   steps also zero the next accumulator register.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Round the cursor down to a 64-byte boundary for the main loop.
	   This can move it back over vectors that were already checked;
	   they are simply scanned again.  */
	and	$-0x40, %rax

	.p2align 4
L(aligned_64_loop):
	/* Load four aligned vectors (64 bytes) and fold them with a
	   bytewise unsigned minimum, so that one dword compare against
	   zero (%xmm3) covers all four.  */
	movaps	(%rax), %xmm0
	movaps	16(%rax), %xmm1
	movaps	32(%rax), %xmm2
	movaps	48(%rax), %xmm6

	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%rax), %rax
	jz	L(aligned_64_loop)

	/* The folded minimum contains a zero dword.  Because the minimum
	   is taken per byte, that dword may be assembled from bytes of
	   different vectors, so each vector is re-checked on its own.
	   %xmm0 and %xmm2 were overwritten by pminub, hence vectors 0 and
	   2 are re-read from memory; vectors 1 and 3 are still live in
	   %xmm1 and %xmm6.  %rax already points 64 bytes past vector 0,
	   so %rcx is moved to keep %rax - %rcx equal to the offset of the
	   vector being tested.  */
	pcmpeqd	-64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	-32(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	/* False positive: none of the four vectors holds a zero element.
	   %rcx is back at its loop value (+48 - 16 - 16 - 16 = 0) and
	   %xmm3 is all-zero again, so the loop can simply resume.  */
	jmp	L(aligned_64_loop)

	.p2align 4
L(exit):
	/* %rax - %rcx = byte offset of the matching vector; >> 2 turns it
	   into an element index.  The mask in %edx has 4 bits per
	   element: bits 0-3 element 0, 4-7 element 1, 8-11 element 2,
	   12-15 element 3.  */
	sub	%rcx, %rax
	shr	$2, %rax
	test	%dl, %dl
	jz	L(exit_high)

	/* Terminator is element 0 or 1.  Test the mask in place instead
	   of copying it into %cl first.  */
	testb	$15, %dl
	jz	L(exit_1)
	ret

	.p2align 4
L(exit_high):
	/* Low mask byte is zero, so the terminator is element 2 or 3.
	   Testing bits 8-11 of %edx avoids the %dh/%ch high-byte
	   registers.  */
	testl	$(15 << 8), %edx
	jz	L(exit_3)
	add	$2, %rax
	ret

	.p2align 4
L(exit_1):
	add	$1, %rax
	ret

	.p2align 4
L(exit_3):
	add	$3, %rax
	ret

	/* Results for a terminator found by the scalar prologue.  Writes
	   to %eax zero-extend into %rax, so the 32-bit forms return the
	   same value with a shorter encoding.  */
	.p2align 4
L(exit_tail0):
	xorl	%eax, %eax
	ret

	.p2align 4
L(exit_tail1):
	movl	$1, %eax
	ret

	.p2align 4
L(exit_tail2):
	movl	$2, %eax
	ret

	.p2align 4
L(exit_tail3):
	movl	$3, %eax
	ret

	.p2align 4
L(exit_tail4):
	movl	$4, %eax
	ret

	.p2align 4
L(exit_tail5):
	movl	$5, %eax
	ret

	.p2align 4
L(exit_tail6):
	movl	$6, %eax
	ret

	.p2align 4
L(exit_tail7):
	movl	$7, %eax
	ret

END (__wcslen)

/* Make the implementation also available under the name wcslen.  */
weak_alias(__wcslen, wcslen)
238 | |