/* wcsrchr with SSE2
2 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
	.text
/* wcsrchr

   Scans the string of 32-bit elements at %rdi up to its first zero
   element and returns in %rax the address of the last element equal
   to the low 32 bits of %rsi, or 0 when no element matches.  When the
   search value is itself zero, the address of the zero element is
   returned.

   Register roles:
     %xmm1	search value replicated into all four 32-bit lanes.
     %xmm2	zero-compare scratch.  It is all-zero before every load
		in L(loop): comparing an all-zero register against data
		that holds no zero element leaves it all-zero.
     %rdi	while scanning, the address just past the 16-byte chunk
		examined last.
     %rax	pmovmskb mask of the lanes equal to the search value.
     %rcx, %rdx	pmovmskb mask of the zero lanes.
     %r8	match mask of the latest chunk that held a match and no
		zero element; 0 while there is none.
     %rsi	once the search value has been copied to %xmm1: the
		%rdi value that belongs to the mask saved in %r8.

   pmovmskb yields one bit per byte, so every 32-bit lane owns four
   consecutive mask bits: 0-3, 4-7, 8-11 and 12-15.

   Writes %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm5 and the flags;
   the stack is not used.  */
ENTRY (wcsrchr)

	movd	%rsi, %xmm1
	mov	%rdi, %rcx
	/* Interleaving the low lanes of %xmm1 with themselves twice
	   copies lane 0 into all four lanes.  */
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1
	/* %rcx = offset of the string inside its 64-byte-aligned block.
	   Above 48, an unaligned 16-byte load would run past the end of
	   that block, so the aligned-load path is taken instead.  */
	and	$63, %rcx
	cmp	$48, %rcx
	ja	L(crosscache)

	/* First 16 bytes through an unaligned load: %rcx = zero mask,
	   %rax = match mask.  */
	movdqu	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm0, %rax
	add	$16, %rdi

	test	%rax, %rax
	jnz	L(unaligned_match1)

	/* No match; a zero element here means there is nothing to
	   return.  */
	test	%rcx, %rcx
	jnz	L(return_null)

	/* Neither found.  Round %rdi down to a 16-byte boundary so the
	   loop can use aligned loads (bytes already examined may be
	   examined again) and record that no match has been seen.  */
	and	$-16, %rdi
	xor	%r8, %r8
	jmp	L(loop)

	.p2align 4
L(unaligned_match1):
	/* The first chunk has a match.  If it also has a zero element,
	   the result lies inside this chunk.  */
	test	%rcx, %rcx
	jnz	L(prolog_find_zero_1)

	/* Otherwise save the match.  %rsi keeps the unaligned end address
	   the mask is relative to; %rdi is rounded down for the loop.  */
	mov	%rax, %r8
	mov	%rdi, %rsi
	and	$-16, %rdi
	jmp	L(loop)

	.p2align 4
L(crosscache):
	/* %rcx = misalignment within 16 bytes.  Load the aligned chunk
	   that contains the start of the string and shift both masks
	   right by that amount, which discards the bytes in front of the
	   string and makes bit 0 correspond to its first byte.  */
	and	$15, %rcx
	and	$-16, %rdi
	pxor	%xmm3, %xmm3
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm3, %rdx
	pmovmskb %xmm0, %rax
	shr	%cl, %rdx
	shr	%cl, %rax
	add	$16, %rdi

	test	%rax, %rax
	jnz	L(unaligned_match)

	/* No match; a zero element here means there is nothing to
	   return.  */
	test	%rdx, %rdx
	jnz	L(return_null)

	/* Neither found: no match recorded yet, %rdi is already
	   aligned.  */
	xor	%r8, %r8
	jmp	L(loop)

	.p2align 4
L(unaligned_match):
	/* Match in the first (aligned-load) chunk.  If it also has a zero
	   element, the result lies inside this chunk.  */
	test	%rdx, %rdx
	jnz	L(prolog_find_zero)

	/* Save the shifted mask.  Adding the misalignment to the end
	   address keeps bit 0 of the saved mask at %rsi - 16.  Execution
	   falls through into the loop with %rdi still aligned.  */
	mov	%rax, %r8
	lea	(%rdi, %rcx), %rsi

/* Main loop over 16-byte-aligned chunks, unrolled four times.  Each
   step loads one chunk, puts the zero mask in %rcx and the match mask
   in %rax, merges the match mask into %rcx and leaves for L(matches)
   when the result is non-zero.  */
	.p2align 4
L(loop):
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm0, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm3, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm4, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm5, %rax
	or	%rax, %rcx
	jz	L(loop)

	.p2align 4
L(matches):
	/* %rax = match mask of the current chunk.  When it is zero, the
	   chunk was flagged for a zero element only.  */
	test	%rax, %rax
	jnz	L(match)
L(return_value):
	/* End of string without a usable match in the current chunk:
	   return the saved match, or NULL while %r8 is still zero.  */
	test	%r8, %r8
	jz	L(return_null)
	mov	%r8, %rax
	mov	%rsi, %rdi

	/* Select the highest matching lane of the chunk starting at
	   %rdi - 16.  Bits 4-7 of %ah are mask bits 12-15 (fourth lane),
	   the rest of %ah is the third lane and bits 4-7 of %al are the
	   second lane; otherwise the match is in the first lane.  */
	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(match):
	/* %rcx was merged with %rax in the loop; fetch the zero mask
	   again from %xmm2.  */
	pmovmskb %xmm2, %rcx
	test	%rcx, %rcx
	jnz	L(find_zero)
	/* Match and no zero element: this chunk replaces any match saved
	   earlier.  %xmm2 is all-zero again, so the loop restarts at its
	   first step.  */
	mov	%rax, %r8
	mov	%rdi, %rsi
	jmp	L(loop)

	.p2align 4
L(find_zero):
	/* The chunk holds both a match and a zero element.  Find the lane
	   of the first zero element from the low end of %rcx; each target
	   then drops the match bits that lie beyond it.  */
	test	$15, %cl
	jnz	L(find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(find_zero_in_third_wchar)

	/* Zero element in the fourth lane.  gas evaluates << before -, so
	   the mask is (1 << 13) - 1: lanes one to three plus the lowest
	   bit of the fourth.  If nothing is left, fall back to the saved
	   match.  */
	and	$1 << 13 - 1, %rax
	jz	L(return_value)

	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_first_wchar):
	/* The first lane is the zero element.  It is the result only if
	   it also matches (search value zero); otherwise use the saved
	   match.  */
	test	$1, %rax
	jz	L(return_value)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_second_wchar):
	/* Keep the first lane plus the lowest bit of the second:
	   (1 << 5) - 1.  */
	and	$1 << 5 - 1, %rax
	jz	L(return_value)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_third_wchar):
	/* Keep lanes one and two plus the lowest bit of the third:
	   (1 << 9) - 1.  */
	and	$1 << 9 - 1, %rax
	jz	L(return_value)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

/* Same lane selection as L(find_zero), but for the first chunk of the
   string: no earlier match can exist, so an empty result is NULL.  */
	.p2align 4
L(prolog_find_zero):
	/* Entered from the aligned-load prolog: add the misalignment so
	   that %rdi - 16 is the start of the string, matching the shifted
	   masks, and move the zero mask to %rcx.  */
	add	%rcx, %rdi
	mov	%rdx, %rcx
L(prolog_find_zero_1):
	test	$15, %cl
	jnz	L(prolog_find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(prolog_find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(prolog_find_zero_in_third_wchar)

	/* Zero element in the fourth lane; mask is (1 << 13) - 1.  */
	and	$1 << 13 - 1, %rax
	jz	L(return_null)

	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_first_wchar):
	/* The string starts with the zero element: the result is its
	   address if the search value is zero, NULL otherwise.  */
	test	$1, %rax
	jz	L(return_null)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	/* Mask is (1 << 5) - 1.  */
	and	$1 << 5 - 1, %rax
	jz	L(return_null)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_third_wchar):
	/* Mask is (1 << 9) - 1.  */
	and	$1 << 9 - 1, %rax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

/* Shared exits.  %rdi is the end address of the chunk holding the
   match, so lane k (counting from 0) is at %rdi - 16 + 4 * k.  */
	.p2align 4
L(match_second_wchar):
	lea	-12(%rdi), %rax
	ret

	.p2align 4
L(match_third_wchar):
	lea	-8(%rdi), %rax
	ret

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%rdi), %rax
	ret

	.p2align 4
L(return_null):
	/* No matching element before the end of the string.  */
	xor	%rax, %rax
	ret

END (wcsrchr)
282 | |