1 | /* Optimized wcslen for x86-64 with SSE2. |
2 | Copyright (C) 2011-2021 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | |
22 | .text |
23 | ENTRY (__wcslen) /* size_t wcslen (const wchar_t *s).  In: %rdi = s.  Out: %rax = number of 4-byte characters before the first zero one.  Clobbers %rcx, %rdx, %xmm0-%xmm3, %xmm6 and the flags. */ |
24 | cmpl $0, (%rdi) /* Test characters 0-7 one at a time; a hit returns its index as a constant. */ |
25 | jz L(exit_tail0) |
26 | cmpl $0, 4(%rdi) |
27 | jz L(exit_tail1) |
28 | cmpl $0, 8(%rdi) |
29 | jz L(exit_tail2) |
30 | cmpl $0, 12(%rdi) |
31 | jz L(exit_tail3) |
32 | cmpl $0, 16(%rdi) |
33 | jz L(exit_tail4) |
34 | cmpl $0, 20(%rdi) |
35 | jz L(exit_tail5) |
36 | cmpl $0, 24(%rdi) |
37 | jz L(exit_tail6) |
38 | cmpl $0, 28(%rdi) |
39 | jz L(exit_tail7) |
40 | /* Characters 0-7 are nonzero from here on. */ |
41 | pxor %xmm0, %xmm0 /* %xmm0 = 0, the pattern every 16-byte block is compared against. */ |
42 | /* %rax = lowest 16-byte-aligned address strictly above s + 16 (at most s + 32); %rcx = s + 16 is the bias that L(exit) subtracts again. */ |
43 | lea 32(%rdi), %rax |
44 | lea 16(%rdi), %rcx |
45 | and $-16, %rax |
46 | /* Twelve unrolled steps: compare one aligned block with zero, collect the byte mask in %edx, advance %rax past the block and leave at the first nonzero mask.  A miss leaves the compare register all-zero, so it serves as the zero pattern again four steps later. */ |
47 | pcmpeqd (%rax), %xmm0 |
48 | pmovmskb %xmm0, %edx |
49 | pxor %xmm1, %xmm1 |
50 | test %edx, %edx |
51 | lea 16(%rax), %rax /* lea leaves the flags from test intact for the jnz below. */ |
52 | jnz L(exit) |
53 |
54 | pcmpeqd (%rax), %xmm1 |
55 | pmovmskb %xmm1, %edx |
56 | pxor %xmm2, %xmm2 |
57 | test %edx, %edx |
58 | lea 16(%rax), %rax |
59 | jnz L(exit) |
60 |
61 | pcmpeqd (%rax), %xmm2 |
62 | pmovmskb %xmm2, %edx |
63 | pxor %xmm3, %xmm3 |
64 | test %edx, %edx |
65 | lea 16(%rax), %rax |
66 | jnz L(exit) |
67 |
68 | pcmpeqd (%rax), %xmm3 |
69 | pmovmskb %xmm3, %edx |
70 | test %edx, %edx |
71 | lea 16(%rax), %rax |
72 | jnz L(exit) |
73 |
74 | pcmpeqd (%rax), %xmm0 |
75 | pmovmskb %xmm0, %edx |
76 | test %edx, %edx |
77 | lea 16(%rax), %rax |
78 | jnz L(exit) |
79 |
80 | pcmpeqd (%rax), %xmm1 |
81 | pmovmskb %xmm1, %edx |
82 | test %edx, %edx |
83 | lea 16(%rax), %rax |
84 | jnz L(exit) |
85 |
86 | pcmpeqd (%rax), %xmm2 |
87 | pmovmskb %xmm2, %edx |
88 | test %edx, %edx |
89 | lea 16(%rax), %rax |
90 | jnz L(exit) |
91 |
92 | pcmpeqd (%rax), %xmm3 |
93 | pmovmskb %xmm3, %edx |
94 | test %edx, %edx |
95 | lea 16(%rax), %rax |
96 | jnz L(exit) |
97 |
98 | pcmpeqd (%rax), %xmm0 |
99 | pmovmskb %xmm0, %edx |
100 | test %edx, %edx |
101 | lea 16(%rax), %rax |
102 | jnz L(exit) |
103 |
104 | pcmpeqd (%rax), %xmm1 |
105 | pmovmskb %xmm1, %edx |
106 | test %edx, %edx |
107 | lea 16(%rax), %rax |
108 | jnz L(exit) |
109 |
110 | pcmpeqd (%rax), %xmm2 |
111 | pmovmskb %xmm2, %edx |
112 | test %edx, %edx |
113 | lea 16(%rax), %rax |
114 | jnz L(exit) |
115 |
116 | pcmpeqd (%rax), %xmm3 |
117 | pmovmskb %xmm3, %edx |
118 | test %edx, %edx |
119 | lea 16(%rax), %rax |
120 | jnz L(exit) |
121 | /* No terminator in the unrolled part.  Round %rax down to a 64-byte boundary; blocks already tested may be scanned a second time, which is harmless. */ |
122 | and $-0x40, %rax |
123 | /* Main loop: 64 bytes per iteration.  %xmm3 is all-zero on entry and every time the loop is re-entered. */ |
124 | .p2align 4 |
125 | L(aligned_64_loop): |
126 | movaps (%rax), %xmm0 |
127 | movaps 16(%rax), %xmm1 |
128 | movaps 32(%rax), %xmm2 |
129 | movaps 48(%rax), %xmm6 |
130 | /* Fold the four blocks with a byte-wise minimum: a zero character in any of them leaves a zero dword in %xmm2.  Because the fold is byte-wise it can also produce a zero dword when no single character is zero, so a hit is only a candidate. */ |
131 | pminub %xmm1, %xmm0 |
132 | pminub %xmm6, %xmm2 |
133 | pminub %xmm0, %xmm2 |
134 | pcmpeqd %xmm3, %xmm2 |
135 | pmovmskb %xmm2, %edx |
136 | test %edx, %edx |
137 | lea 64(%rax), %rax |
138 | jz L(aligned_64_loop) |
139 | /* Candidate found; %rax already points past the 64 bytes.  Recheck the four blocks in order, stepping %rcx through s + 64, s + 48, s + 32, s + 16 so that %rax - %rcx is always the byte offset from s of the block under test. */ |
140 | pcmpeqd -64(%rax), %xmm3 |
141 | pmovmskb %xmm3, %edx |
142 | test %edx, %edx |
143 | lea 48(%rcx), %rcx |
144 | jnz L(exit) |
145 |
146 | pcmpeqd %xmm1, %xmm3 /* %xmm1 and %xmm6 still hold blocks 1 and 3; %xmm0 and %xmm2 were overwritten by pminub, hence blocks 0 and 2 are read from memory. */ |
147 | pmovmskb %xmm3, %edx |
148 | test %edx, %edx |
149 | lea -16(%rcx), %rcx |
150 | jnz L(exit) |
151 |
152 | pcmpeqd -32(%rax), %xmm3 |
153 | pmovmskb %xmm3, %edx |
154 | test %edx, %edx |
155 | lea -16(%rcx), %rcx |
156 | jnz L(exit) |
157 |
158 | pcmpeqd %xmm6, %xmm3 |
159 | pmovmskb %xmm3, %edx |
160 | test %edx, %edx |
161 | lea -16(%rcx), %rcx |
162 | jnz L(exit) |
163 | /* All four rechecks missed: the candidate was an artefact of the byte-wise minimum.  %rcx is back at s + 16 and %xmm3 is still all-zero, so simply continue scanning. */ |
164 | jmp L(aligned_64_loop) |
165 | /* L(exit) expects: %rax = end of the block holding the terminator, %rcx = bias for that block, %edx = byte mask of the block (four bits per character). */ |
166 | .p2align 4 |
167 | L(exit): |
168 | sub %rcx, %rax |
169 | shr $2, %rax /* %rax = index of the first character of the block. */ |
170 | test %dl, %dl /* Mask bits 0-7 cover characters 0 and 1 of the block. */ |
171 | jz L(exit_high) |
172 |
173 | /* Low nibble set: character 0 is the terminator; otherwise it is character 1.  test sets ZF without needing a scratch register. */ |
174 | test $15, %dl |
175 | jz L(exit_1) |
176 | ret |
177 |
178 | .p2align 4 |
179 | L(exit_high): |
180 | /* Mask bits 8-11 belong to character 2; if they are clear the terminator is character 3. */ |
181 | test $15, %dh |
182 | jz L(exit_3) |
183 | add $2, %rax |
184 | ret |
185 |
186 | .p2align 4 |
187 | L(exit_1): |
188 | add $1, %rax |
189 | ret |
190 |
191 | .p2align 4 |
192 | L(exit_3): |
193 | add $3, %rax |
194 | ret |
195 | /* Constant results for a terminator among characters 0-7.  Writing %eax zero-extends into %rax, so the 32-bit forms give the same result with a shorter encoding. */ |
196 | .p2align 4 |
197 | L(exit_tail0): |
198 | xor %eax, %eax |
199 | ret |
200 |
201 | .p2align 4 |
202 | L(exit_tail1): |
203 | mov $1, %eax |
204 | ret |
205 |
206 | .p2align 4 |
207 | L(exit_tail2): |
208 | mov $2, %eax |
209 | ret |
210 |
211 | .p2align 4 |
212 | L(exit_tail3): |
213 | mov $3, %eax |
214 | ret |
215 |
216 | .p2align 4 |
217 | L(exit_tail4): |
218 | mov $4, %eax |
219 | ret |
220 |
221 | .p2align 4 |
222 | L(exit_tail5): |
223 | mov $5, %eax |
224 | ret |
225 |
226 | .p2align 4 |
227 | L(exit_tail6): |
228 | mov $6, %eax |
229 | ret |
230 |
231 | .p2align 4 |
232 | L(exit_tail7): |
233 | mov $7, %eax |
234 | ret |
235 |
236 | END (__wcslen) |
237 | |
238 | weak_alias(__wcslen, wcslen) /* Also provide the routine under the name wcslen, through the weak_alias macro (defined outside this file). */ |
239 | |