/* wcsrchr with SSE2
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21 .text
/* wchar_t *wcsrchr (const wchar_t *s, wchar_t c)

   Return the address of the last wchar_t in S that equals C, or NULL if
   there is none.  The terminating null wide character takes part in the
   search, so C == 0 yields the address of the terminator.

   In:       %rdi = s, %esi = c
   Out:      %rax = result pointer or 0
   Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm5, flags

   The string is examined 16 bytes (four wchar_t) at a time.  pcmpeqd
   sets all four bytes of a lane alike and pmovmskb collects one bit per
   byte, so every wchar_t owns one nibble of the 16-bit mask and that
   nibble is either 0x0 or 0xf:
	bits  0-3   first wchar_t	(low  nibble of %al / %cl)
	bits  4-7   second wchar_t	(high nibble of %al / %cl)
	bits  8-11  third wchar_t	(low  nibble of %ah / %ch)
	bits 12-15  fourth wchar_t	(high nibble of %ah / %ch)
   NOTE(review): the nibble tests applied to the shifted masks in
   L(crosscache) presume S is 4-byte aligned -- confirm with callers.

   State kept while scanning:
	%xmm1  C replicated into all four lanes.
	%xmm2  all-zero whenever L(loop) is entered; used as the zero
	       comparand and overwritten with the zero-compare result.
	%rdi   one past the 16-byte window examined most recently.
	%r8    match mask of the latest window that contained a match
	       and no terminator; 0 while no such window was seen.
	%rsi   value %rdi had when %r8 was recorded.  */

/* Nibble selectors for the 8-bit halves of a mask.  */
#define LOW_NIBBLE	0x0f
#define HIGH_NIBBLE	0xf0

/* Keep the match bits up to and including the wchar_t that holds the
   terminator.  Only the lowest bit of that wchar_t's nibble is kept;
   the nibble is uniform, so one bit carries the same information.  */
#define THROUGH_WCHAR2	0x001f		/* (1 << 5) - 1  */
#define THROUGH_WCHAR3	0x01ff		/* (1 << 9) - 1  */
#define THROUGH_WCHAR4	0x1fff		/* (1 << 13) - 1  */

ENTRY (wcsrchr)

	/* %xmm1 = { c, c, c, c }, %xmm2 = 0.  */
	movd	%esi, %xmm1
	punpckldq %xmm1, %xmm1
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2

	/* With (s & 63) > 48 the 16 bytes starting at S would extend past
	   the end of the 64-byte line holding S; use the aligned prolog
	   in that case.  */
	mov	%edi, %ecx
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(crosscache)

	/* Prolog A: unaligned load of the first 16 bytes.
	   %ecx = terminator mask, %eax = match mask, %rdi = s + 16.  */
	movdqu	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	add	$16, %rdi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	/* No match; a terminator here means C is absent.  */
	test	%ecx, %ecx
	jnz	L(return_null)

	/* Neither: round down to the next aligned block (it may overlap
	   the bytes just examined) and start with no saved match.  */
	and	$-16, %rdi
	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(unaligned_match1):
	/* Match in the first window.  If the window also holds the
	   terminator the answer is decided here.  */
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	/* Otherwise remember the match and continue on aligned blocks.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	and	$-16, %rdi
	jmp	L(loop)

	.p2align 4
L(crosscache):
	/* Prolog B: load the aligned block containing S and shift both
	   masks right by the byte offset of S (%ecx) so that bit 0
	   corresponds to S itself.  The zero compare goes into %xmm3,
	   leaving %xmm2 all-zero for the loop regardless of what lies in
	   the block ahead of S.  */
	and	$15, %ecx
	and	$-16, %rdi
	pxor	%xmm3, %xmm3
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm3, %edx
	pmovmskb %xmm0, %eax
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %rdi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(unaligned_match):
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	/* Save the shifted mask; %rsi = %rdi + offset = s + 16, i.e. one
	   past the window the shifted mask describes.  */
	mov	%eax, %r8d
	lea	(%rdi, %rcx), %rsi

/* Main loop over aligned blocks, unrolled four times.  Each step leaves
   %eax = match mask and %ecx = match mask | terminator mask, with %rdi
   already advanced past the block.  */
	.p2align 4
L(loop):
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm4, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm5, %eax
	or	%eax, %ecx
	jz	L(loop)

	.p2align 4
L(matches):
	/* The block has a match, a terminator, or both.  */
	test	%eax, %eax
	jnz	L(match)

	/* Terminator without a usable match in this block: answer from the
	   saved window, or NULL when nothing was saved.  */
L(return_value):
	test	%r8d, %r8d
	jz	L(return_null)
	mov	%r8d, %eax
	mov	%rsi, %rdi

	/* Highest set nibble = last matching wchar_t of the window.  */
	test	$HIGH_NIBBLE, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(match):
	/* Re-read the terminator mask; %ecx was merged with %eax above.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)

	/* Match only: this block becomes the saved window.  %xmm2 is
	   all-zero again because no lane compared equal to zero.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	jmp	L(loop)

	.p2align 4
L(find_zero):
	/* Block with both a match and the terminator.  Locate the first
	   null wchar_t, drop the matches beyond it, and fall back to the
	   saved window if no match remains.  */
	test	$LOW_NIBBLE, %cl
	jnz	L(find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(find_zero_in_second_wchar)
	test	$LOW_NIBBLE, %ch
	jnz	L(find_zero_in_third_wchar)

	/* Terminator is the fourth wchar_t.  */
	and	$THROUGH_WCHAR4, %eax
	jz	L(return_value)

	test	$HIGH_NIBBLE, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_first_wchar):
	/* Only the first wchar_t is in range; it matches only if C is the
	   null wide character.  */
	test	$1, %al
	jz	L(return_value)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_second_wchar):
	and	$THROUGH_WCHAR2, %eax
	jz	L(return_value)

	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_third_wchar):
	and	$THROUGH_WCHAR3, %eax
	jz	L(return_value)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	/* Prolog counterparts of L(find_zero): the terminator lies in the
	   very first window, so there is no saved window to fall back to
	   and a miss returns NULL.  */
	.p2align 4
L(prolog_find_zero):
	/* Entered from L(crosscache): make %rdi = s + 16 and move the
	   shifted terminator mask into %ecx.  */
	add	%rcx, %rdi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	test	$LOW_NIBBLE, %cl
	jnz	L(prolog_find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(prolog_find_zero_in_second_wchar)
	test	$LOW_NIBBLE, %ch
	jnz	L(prolog_find_zero_in_third_wchar)

	and	$THROUGH_WCHAR4, %eax
	jz	L(return_null)

	test	$HIGH_NIBBLE, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_first_wchar):
	test	$1, %al
	jz	L(return_null)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	and	$THROUGH_WCHAR2, %eax
	jz	L(return_null)

	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_third_wchar):
	and	$THROUGH_WCHAR3, %eax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$HIGH_NIBBLE, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	/* Shared exits.  %rdi is one past the window, so the second, third
	   and fourth wchar_t sit 12, 8 and 4 bytes below it.  */
	.p2align 4
L(match_second_wchar):
	lea	-12(%rdi), %rax
	ret

	.p2align 4
L(match_third_wchar):
	lea	-8(%rdi), %rax
	ret

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%rdi), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

END (wcsrchr)
282