1 | /* wcslen optimized with SSE2. |
2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (1) |
22 | |
23 | # include <sysdep.h> |
24 | |
25 | #ifndef WCSLEN |
26 | # define WCSLEN __wcslen_sse2 |
27 | #endif |
28 | |
29 | |
30 | .text |
ENTRY (WCSLEN)
	/* C equivalent: size_t wcslen (const wchar_t *s).
	   In:    %rdi = s, an array of 4-byte elements.
	   Out:   %rax = index of the first element equal to zero.
	   Uses:  %rdx, %rdi, %xmm0-%xmm3, %xmm6 and the flags.
	   NOTE(review): the aligned 16-byte loads below only line up with
	   whole elements if s is 4-byte aligned -- confirm with callers.  */

	/* Probe the first eight elements one at a time.  A hit returns
	   its index as a constant.  */
	cmpl	$0, (%rdi)
	jz	L(len_0)
	cmpl	$0, 4(%rdi)
	jz	L(len_1)
	cmpl	$0, 8(%rdi)
	jz	L(len_2)
	cmpl	$0, 12(%rdi)
	jz	L(len_3)
	cmpl	$0, 16(%rdi)
	jz	L(len_4)
	cmpl	$0, 20(%rdi)
	jz	L(len_5)
	cmpl	$0, 24(%rdi)
	jz	L(len_6)
	cmpl	$0, 28(%rdi)
	jz	L(len_7)

	pxor	%xmm0, %xmm0

	/* %rax = (s + 32) rounded down to 16 bytes, the first aligned
	   block to examine.  %rdi = s + 16, so that after %rax has been
	   stepped past a block, %rax - %rdi is the byte offset of that
	   block from s.  */
	leaq	32(%rdi), %rax
	addq	$16, %rdi
	andq	$-16, %rax

	/* Twelve aligned 16-byte blocks, checked one by one.  Each
	   compare is against an all-zero register; when the resulting
	   mask is zero the register is still all-zero, which is why
	   %xmm0-%xmm3 can be reused from the fifth block on.  */

	/* Block 1.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 2.  */
	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 3.  */
	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 4.  */
	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 5.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 6.  */
	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 7.  */
	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 8.  */
	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 9.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 10.  */
	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 11.  */
	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Block 12.  */
	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Drop to a 64-byte boundary for the main loop.  Rounding down
	   means blocks already examined above may be read once more.  */
	andq	$-64, %rax

	.p2align 4
L(loop_4x16):
	movaps	(%rax), %xmm0
	movaps	16(%rax), %xmm1
	movaps	32(%rax), %xmm2
	movaps	48(%rax), %xmm6

	/* Fold the four blocks into %xmm2 with a byte-wise minimum and
	   compare the fold against zero (%xmm3 is all-zero here).  The
	   minimum is taken per byte, so the fold can contain a zero
	   element even when no single block does; a non-zero mask is
	   therefore only a candidate and every block is rechecked
	   below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	addq	$64, %rax
	testl	%edx, %edx
	jz	L(loop_4x16)

	/* Recheck the four blocks in order.  %rax already points past
	   all of them, so %rdi is moved instead to keep %rax - %rdi
	   equal to the byte offset of the block under test.  The four
	   adjustments (+48, -16, -16, -16) cancel out, leaving %rdi
	   unchanged if the loop is re-entered.  */

	/* First block: %xmm0 was overwritten by the fold, so it is read
	   from memory again.  */
	pcmpeqd	-64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$48, %rdi
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Second block, still held in %xmm1.  */
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Third block: %xmm2 was overwritten by the fold, so it is read
	   from memory again.  */
	pcmpeqd	-32(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	testl	%edx, %edx
	jnz	L(found_zero)

	/* Fourth block, still held in %xmm6.  If it has no zero element
	   either, the fold reported a false positive: keep looping.  */
	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	testl	%edx, %edx
	jz	L(loop_4x16)

	.p2align 4
L(found_zero):
	/* %rax = element index of the start of the matching block.
	   %edx holds four mask bits per element: bits 0-3 and 4-7 (%dl)
	   cover elements 0 and 1, bits 8-11 and 12-15 cover elements 2
	   and 3.  */
	subq	%rdi, %rax
	shrq	$2, %rax
	testb	%dl, %dl
	jz	L(found_upper)

	andl	$15, %edx
	jz	L(add_1)
	ret

	/* Deliberately not aligned: this label already falls one byte
	   past a 16-byte boundary.  */
L(found_upper):
	andl	$(15 << 8), %edx
	jz	L(add_3)
	addq	$2, %rax
	ret

	.p2align 3
L(add_1):
	addq	$1, %rax
	ret

	.p2align 3
L(add_3):
	addq	$3, %rax
	ret

	/* Constant results for the scalar probes at function entry.  */
	.p2align 3
L(len_0):
	xorl	%eax, %eax
	ret

	.p2align 3
L(len_1):
	movl	$1, %eax
	ret

	.p2align 3
L(len_2):
	movl	$2, %eax
	ret

	.p2align 3
L(len_3):
	movl	$3, %eax
	ret

	.p2align 3
L(len_4):
	movl	$4, %eax
	ret

	.p2align 3
L(len_5):
	movl	$5, %eax
	ret

	.p2align 3
L(len_6):
	movl	$6, %eax
	ret

	.p2align 3
L(len_7):
	movl	$7, %eax
	ret

END (WCSLEN)
241 | |
242 | #endif |
243 | |