1 | /* Fast SSE2 rawmemchr with a 64-byte loop using the pmaxub instruction. |
2 | |
3 | Copyright (C) 2011-2021 Free Software Foundation, Inc. |
4 | Contributed by Intel Corporation. |
5 | This file is part of the GNU C Library. |
6 | |
7 | The GNU C Library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Lesser General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2.1 of the License, or (at your option) any later version. |
11 | |
12 | The GNU C Library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Lesser General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Lesser General Public |
18 | License along with the GNU C Library; if not, see |
19 | <https://www.gnu.org/licenses/>. */ |
20 | |
21 | #include <sysdep.h> |
22 | |
23 | .text |
/* __rawmemchr: find the first byte equal to a given value, with no length
   limit.

   In:   %rdi = address at which the search starts
         %rsi = value to look for; only its low byte ends up in the pattern
   Out:  %rax = address of the first matching byte
   Uses: %rcx, %rdi, %xmm0-%xmm4 and the flags are overwritten.

   No count register is ever read and every loop below exits only through
   a match, so the search ends only when a matching byte is found.

   Strategy: test the first 16 bytes, then 16-byte aligned chunks until
   %rdi is 64-byte aligned, then 64 bytes per iteration, merging the four
   compare results with pmaxub so that a single pmovmskb/test decides the
   whole block.  */
24 | ENTRY (__rawmemchr) |
/* Start building the compare pattern in %xmm1 and keep a copy of the
   pointer in %rcx for the alignment test below.  */
25 | movd %rsi, %xmm1 |
26 | mov %rdi, %rcx |
27 | |
/* Two byte interleaves replicate the low byte into the low four bytes of
   %xmm1.  */
28 | punpcklbw %xmm1, %xmm1 |
29 | punpcklbw %xmm1, %xmm1 |
30 | |
/* %rcx = offset of the start pointer within its 64-byte block.  pshufd $0
   copies the low dword into all four dwords, which completes the 16-byte
   pattern.  */
31 | and $63, %rcx |
32 | pshufd $0, %xmm1, %xmm1 |
33 | |
/* With an offset above 48, an unaligned 16-byte load from %rdi would run
   past the end of the 64-byte block, so take the aligned path instead.  */
34 | cmp $48, %rcx |
35 | ja L(crosscache) |
36 | |
/* Unaligned compare of the first 16 bytes: pcmpeqb sets each matching byte
   to 0xff and pmovmskb collects one bit per byte into %eax.  */
37 | movdqu (%rdi), %xmm0 |
38 | pcmpeqb %xmm1, %xmm0 |
39 | /* Check if there is a match. */ |
40 | pmovmskb %xmm0, %eax |
41 | test %eax, %eax |
42 | |
43 | jnz L(matches) |
/* No match: round %rdi up to the next 16-byte boundary.  Every byte below
   that boundary was covered by the load above.  */
44 | add $16, %rdi |
45 | and $-16, %rdi |
46 | jmp L(loop_prolog) |
47 | |
48 | .p2align 4 |
49 | L(crosscache): |
/* The start pointer lies in the last 15 bytes of a 64-byte block.
   %rcx = its offset within the 16-byte chunk and %rdi = that chunk's
   aligned base, so the aligned load stays inside the block.  */
50 | and $15, %rcx |
51 | and $-16, %rdi |
52 | movdqa (%rdi), %xmm0 |
53 | |
54 | pcmpeqb %xmm1, %xmm0 |
55 | /* Check if there is a match. */ |
56 | pmovmskb %xmm0, %eax |
57 | /* Remove the leading bytes. */ |
/* Bits for the bytes in front of the start pointer are shifted out, so
   bit 0 now corresponds to the start pointer itself.  Only the low 16 bits
   of the mask can be set, so the arithmetic shift brings in zeros.  */
58 | sar %cl, %eax |
59 | test %eax, %eax |
60 | je L(unaligned_no_match) |
61 | /* Check which byte is a match. */ |
62 | bsf %eax, %eax |
63 | |
/* The index from bsf is relative to the start pointer, that is, relative
   to %rdi + %rcx.  */
64 | add %rdi, %rax |
65 | add %rcx, %rax |
66 | ret |
67 | |
68 | .p2align 4 |
69 | L(unaligned_no_match): |
/* Step to the next aligned chunk and fall through into the prolog.  */
70 | add $16, %rdi |
71 | |
72 | .p2align 4 |
73 | L(loop_prolog): |
/* Check four consecutive aligned 16-byte chunks one at a time.  %rdi is
   16-byte aligned on every path that reaches this label.  */
74 | movdqa (%rdi), %xmm0 |
75 | pcmpeqb %xmm1, %xmm0 |
76 | pmovmskb %xmm0, %eax |
77 | test %eax, %eax |
78 | jnz L(matches) |
79 | |
80 | movdqa 16(%rdi), %xmm2 |
81 | pcmpeqb %xmm1, %xmm2 |
82 | pmovmskb %xmm2, %eax |
83 | test %eax, %eax |
84 | jnz L(matches16) |
85 | |
86 | movdqa 32(%rdi), %xmm3 |
87 | pcmpeqb %xmm1, %xmm3 |
88 | pmovmskb %xmm3, %eax |
89 | test %eax, %eax |
90 | jnz L(matches32) |
91 | |
/* Fourth chunk: %rdi is advanced before the branch, so L(matches0)
   subtracts 16 to address the chunk at the old %rdi + 48.  */
92 | movdqa 48(%rdi), %xmm4 |
93 | pcmpeqb %xmm1, %xmm4 |
94 | add $64, %rdi |
95 | pmovmskb %xmm4, %eax |
96 | test %eax, %eax |
97 | jnz L(matches0) |
98 | |
/* If %rdi is already 64-byte aligned, enter the main loop directly.  */
99 | test $0x3f, %rdi |
100 | jz L(align64_loop) |
101 | |
/* Otherwise check another four chunks the same way.  */
102 | movdqa (%rdi), %xmm0 |
103 | pcmpeqb %xmm1, %xmm0 |
104 | pmovmskb %xmm0, %eax |
105 | test %eax, %eax |
106 | jnz L(matches) |
107 | |
108 | movdqa 16(%rdi), %xmm2 |
109 | pcmpeqb %xmm1, %xmm2 |
110 | pmovmskb %xmm2, %eax |
111 | test %eax, %eax |
112 | jnz L(matches16) |
113 | |
114 | movdqa 32(%rdi), %xmm3 |
115 | pcmpeqb %xmm1, %xmm3 |
116 | pmovmskb %xmm3, %eax |
117 | test %eax, %eax |
118 | jnz L(matches32) |
119 | |
/* Fourth chunk of the second group, again with %rdi advanced before the
   branch to L(matches0).  */
120 | movdqa 48(%rdi), %xmm3 |
121 | pcmpeqb %xmm1, %xmm3 |
122 | pmovmskb %xmm3, %eax |
123 | |
124 | add $64, %rdi |
125 | test %eax, %eax |
126 | jnz L(matches0) |
127 | |
/* Round %rdi down to a 64-byte boundary.  This steps back by 16, 32 or 48
   bytes into the group just checked, which is known to hold no match.  */
128 | and $-64, %rdi |
129 | |
130 | .p2align 4 |
131 | L(align64_loop): |
/* Main loop: 64 bytes per iteration, starting from a 64-byte aligned
   %rdi.  */
132 | movdqa (%rdi), %xmm0 |
133 | movdqa 16(%rdi), %xmm2 |
134 | movdqa 32(%rdi), %xmm3 |
135 | movdqa 48(%rdi), %xmm4 |
136 | |
/* Compare all four chunks with the pattern.  */
137 | pcmpeqb %xmm1, %xmm0 |
138 | pcmpeqb %xmm1, %xmm2 |
139 | pcmpeqb %xmm1, %xmm3 |
140 | pcmpeqb %xmm1, %xmm4 |
141 | |
/* Each compare result byte is 0x00 or 0xff, so the unsigned byte maximum
   acts as an OR.  %xmm4 ends up with 0xff wherever any of the four chunks
   matched, while %xmm0 and %xmm2 keep their own results.  */
142 | pmaxub %xmm0, %xmm3 |
143 | pmaxub %xmm2, %xmm4 |
144 | pmaxub %xmm3, %xmm4 |
145 | pmovmskb %xmm4, %eax |
146 | |
/* Advance first; the subtraction after the loop undoes this for the block
   that contained the match.  */
147 | add $64, %rdi |
148 | |
149 | test %eax, %eax |
150 | jz L(align64_loop) |
151 | |
/* A match is somewhere in the block just examined.  Point %rdi back at it
   and find out which chunk holds the match.  */
152 | sub $64, %rdi |
153 | |
/* The chunks at offsets 0 and 16 still have their individual results in
   %xmm0 and %xmm2.  */
154 | pmovmskb %xmm0, %eax |
155 | test %eax, %eax |
156 | jnz L(matches) |
157 | |
158 | pmovmskb %xmm2, %eax |
159 | test %eax, %eax |
160 | jnz L(matches16) |
161 | |
/* The results for the chunks at offsets 32 and 48 were overwritten by the
   merge, so they are compared again.  The chunk at 48 is compared straight
   from memory into %xmm1, which destroys the pattern; the pattern is not
   used after this point.  */
162 | movdqa 32(%rdi), %xmm3 |
163 | pcmpeqb %xmm1, %xmm3 |
164 | |
165 | pcmpeqb 48(%rdi), %xmm1 |
166 | pmovmskb %xmm3, %eax |
167 | test %eax, %eax |
168 | jnz L(matches32) |
169 | |
/* The first three chunks had no match, so the merged result must have come
   from the chunk at offset 48; no test is needed before bsf.  */
170 | pmovmskb %xmm1, %eax |
171 | bsf %eax, %eax |
172 | lea 48(%rdi, %rax), %rax |
173 | ret |
174 | |
175 | .p2align 4 |
176 | L(matches0): |
/* Match in the chunk at %rdi - 16 (%rdi was already advanced by 64).
   %eax holds the compare mask; bsf returns the index of its lowest set
   bit, which is the first matching byte.  */
177 | bsf %eax, %eax |
178 | lea -16(%rax, %rdi), %rax |
179 | ret |
180 | |
181 | .p2align 4 |
182 | L(matches): |
/* Match in the 16 bytes at %rdi.  */
183 | bsf %eax, %eax |
184 | add %rdi, %rax |
185 | ret |
186 | |
187 | .p2align 4 |
188 | L(matches16): |
/* Match in the chunk at %rdi + 16.  */
189 | bsf %eax, %eax |
190 | lea 16(%rax, %rdi), %rax |
191 | ret |
192 | |
193 | .p2align 4 |
194 | L(matches32): |
/* Match in the chunk at %rdi + 32.  */
195 | bsf %eax, %eax |
196 | lea 32(%rax, %rdi), %rax |
197 | ret |
198 | |
199 | END (__rawmemchr) |
200 | |
/* Presumably publishes the routine under the name rawmemchr as a weak alias
   of __rawmemchr, and emits the hidden definition used for calls made from
   inside the library.  Both macros are defined outside this file; confirm
   against their definitions.  */
201 | weak_alias (__rawmemchr, rawmemchr) |
202 | libc_hidden_builtin_def (__rawmemchr) |
203 | |