1 | /* rawmemchr optimized with SSE2. |
2 | Copyright (C) 2017-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | #include <sysdep.h> |
21 | |
22 | /* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation |
23 | so we need this to build for ISA V2 builds. */ |
24 | #if ISA_SHOULD_BUILD (2) |
25 | |
26 | # ifndef RAWMEMCHR /* Default symbol name; the guard leaves any earlier definition of RAWMEMCHR in effect.  */ |
27 | # define RAWMEMCHR __rawmemchr_sse2 |
28 | # endif |
29 | |
30 | .text |
31 | ENTRY (RAWMEMCHR) /* In: %rdi = address to start scanning at, low byte of %rsi = byte to find.  Out: %rax = address of the first matching byte.  No length is taken; the scan only stops on a match.  Uses %rcx, %xmm0-%xmm4 and flags as scratch.  */ |
32 | movd %rsi, %xmm1 |
33 | mov %rdi, %rcx |
34 | /* Broadcast the search byte to all 16 byte lanes of %xmm1: two byte unpacks here, then the dword splat (pshufd) below.  */ |
35 | punpcklbw %xmm1, %xmm1 |
36 | punpcklbw %xmm1, %xmm1 |
37 | /* %rcx = offset of the start address within its 64-byte block.  */ |
38 | and $63, %rcx |
39 | pshufd $0, %xmm1, %xmm1 |
40 | /* With an offset above 48, an unaligned 16-byte load would run past the end of the 64-byte block; use the aligned-load path instead.  */ |
41 | cmp $48, %rcx |
42 | ja L(crosscache) |
43 | /* Otherwise test the first 16 bytes with a single unaligned load.  */ |
44 | movdqu (%rdi), %xmm0 |
45 | pcmpeqb %xmm1, %xmm0 |
46 | /* Check if there is a match. */ |
47 | pmovmskb %xmm0, %eax |
48 | test %eax, %eax |
49 | /* On a match return through L(matches); otherwise advance %rdi to the next 16-byte boundary (every byte before it has been checked) and continue with aligned loads.  */ |
50 | jnz L(matches) |
51 | add $16, %rdi |
52 | and $-16, %rdi |
53 | jmp L(loop_prolog) |
54 | /* The start address lies in the last 15 bytes of a 64-byte block: load the enclosing aligned 16 bytes and discard matches that precede the start address.  */ |
55 | .p2align 4 |
56 | L(crosscache): |
57 | and $15, %rcx /* %rcx = offset of the start address within the aligned 16-byte chunk */ |
58 | and $-16, %rdi |
59 | movdqa (%rdi), %xmm0 |
60 | |
61 | pcmpeqb %xmm1, %xmm0 |
62 | /* Check if there is a match. */ |
63 | pmovmskb %xmm0, %eax |
64 | /* Remove the leading bytes. */ |
65 | sar %cl, %eax /* the mask occupies only the low 16 bits, so the arithmetic shift behaves as a logical one */ |
66 | test %eax, %eax |
67 | je L(unaligned_no_match) |
68 | /* Check which byte is a match. */ |
69 | bsf %eax, %eax |
70 | /* Result = aligned chunk base + discarded leading offset + bit index of the match.  */ |
71 | add %rdi, %rax |
72 | add %rcx, %rax |
73 | ret |
74 | |
75 | .p2align 4 |
76 | L(unaligned_no_match): |
77 | add $16, %rdi |
78 | /* Fall through into L(loop_prolog) with %rdi 16-byte aligned.  */ |
79 | .p2align 4 |
80 | L(loop_prolog): /* Check 64 bytes, one aligned 16-byte chunk at a time, starting at %rdi.  */ |
81 | movdqa (%rdi), %xmm0 |
82 | pcmpeqb %xmm1, %xmm0 |
83 | pmovmskb %xmm0, %eax |
84 | test %eax, %eax |
85 | jnz L(matches) |
86 | |
87 | movdqa 16(%rdi), %xmm2 |
88 | pcmpeqb %xmm1, %xmm2 |
89 | pmovmskb %xmm2, %eax |
90 | test %eax, %eax |
91 | jnz L(matches16) |
92 | |
93 | movdqa 32(%rdi), %xmm3 |
94 | pcmpeqb %xmm1, %xmm3 |
95 | pmovmskb %xmm3, %eax |
96 | test %eax, %eax |
97 | jnz L(matches32) |
98 | |
99 | movdqa 48(%rdi), %xmm4 |
100 | pcmpeqb %xmm1, %xmm4 |
101 | add $64, %rdi /* advance before testing; L(matches0) compensates with its -16 displacement */ |
102 | pmovmskb %xmm4, %eax |
103 | test %eax, %eax |
104 | jnz L(matches0) |
105 | /* If %rdi is now 64-byte aligned, enter the main loop directly.  */ |
106 | test $0x3f, %rdi |
107 | jz L(align64_loop) |
108 | /* Otherwise check a further 64 bytes in the same chunk-by-chunk way before aligning.  */ |
109 | movdqa (%rdi), %xmm0 |
110 | pcmpeqb %xmm1, %xmm0 |
111 | pmovmskb %xmm0, %eax |
112 | test %eax, %eax |
113 | jnz L(matches) |
114 | |
115 | movdqa 16(%rdi), %xmm2 |
116 | pcmpeqb %xmm1, %xmm2 |
117 | pmovmskb %xmm2, %eax |
118 | test %eax, %eax |
119 | jnz L(matches16) |
120 | |
121 | movdqa 32(%rdi), %xmm3 |
122 | pcmpeqb %xmm1, %xmm3 |
123 | pmovmskb %xmm3, %eax |
124 | test %eax, %eax |
125 | jnz L(matches32) |
126 | |
127 | movdqa 48(%rdi), %xmm3 |
128 | pcmpeqb %xmm1, %xmm3 |
129 | pmovmskb %xmm3, %eax |
130 | |
131 | add $64, %rdi |
132 | test %eax, %eax |
133 | jnz L(matches0) |
134 | /* Round %rdi down to a 64-byte boundary.  This can step back over bytes that were already checked above; they contained no match, so re-scanning them is harmless.  */ |
135 | and $-64, %rdi |
136 | |
137 | .p2align 4 |
138 | L(align64_loop): /* Main loop: 64 aligned bytes per iteration.  */ |
139 | movdqa (%rdi), %xmm0 |
140 | movdqa 16(%rdi), %xmm2 |
141 | movdqa 32(%rdi), %xmm3 |
142 | movdqa 48(%rdi), %xmm4 |
143 | /* Compare each of the four chunks against the broadcast search byte.  */ |
144 | pcmpeqb %xmm1, %xmm0 |
145 | pcmpeqb %xmm1, %xmm2 |
146 | pcmpeqb %xmm1, %xmm3 |
147 | pcmpeqb %xmm1, %xmm4 |
148 | /* Merge the four compare results with unsigned byte max so that one mask test covers all 64 bytes.  This overwrites %xmm3 and %xmm4; %xmm0 and %xmm2 keep their per-chunk results.  */ |
149 | pmaxub %xmm0, %xmm3 |
150 | pmaxub %xmm2, %xmm4 |
151 | pmaxub %xmm3, %xmm4 |
152 | pmovmskb %xmm4, %eax |
153 | |
154 | add $64, %rdi |
155 | |
156 | test %eax, %eax |
157 | jz L(align64_loop) |
158 | /* A match lies somewhere in the 64 bytes just examined: step %rdi back to them and find the first chunk that contains it.  */ |
159 | sub $64, %rdi |
160 | /* Chunks 0 and 1: their compare results are still intact in %xmm0 and %xmm2.  */ |
161 | pmovmskb %xmm0, %eax |
162 | test %eax, %eax |
163 | jnz L(matches) |
164 | |
165 | pmovmskb %xmm2, %eax |
166 | test %eax, %eax |
167 | jnz L(matches16) |
168 | /* Chunk 2: %xmm3 was overwritten by the merge, so reload and compare again.  */ |
169 | movdqa 32(%rdi), %xmm3 |
170 | pcmpeqb %xmm1, %xmm3 |
171 | /* Chunk 3 is compared straight into %xmm1; the broadcast byte is not needed afterwards because both remaining paths return.  */ |
172 | pcmpeqb 48(%rdi), %xmm1 |
173 | pmovmskb %xmm3, %eax |
174 | test %eax, %eax |
175 | jnz L(matches32) |
176 | /* Chunks 0-2 held no match, so the match must be in chunk 3.  */ |
177 | pmovmskb %xmm1, %eax |
178 | bsf %eax, %eax |
179 | lea 48(%rdi, %rax), %rax |
180 | ret |
181 | |
182 | .p2align 4 |
183 | L(matches0): /* %eax = compare mask for the 16 bytes ending at %rdi (reached after %rdi was already advanced by 64).  */ |
184 | bsf %eax, %eax |
185 | lea -16(%rax, %rdi), %rax |
186 | ret |
187 | |
188 | .p2align 4 |
189 | L(matches): /* %eax = compare mask for the 16 bytes at %rdi; the lowest set bit is the first match.  */ |
190 | bsf %eax, %eax |
191 | add %rdi, %rax |
192 | ret |
193 | |
194 | .p2align 4 |
195 | L(matches16): /* %eax = compare mask for the 16 bytes at %rdi + 16.  */ |
196 | bsf %eax, %eax |
197 | lea 16(%rax, %rdi), %rax |
198 | ret |
199 | |
200 | .p2align 4 |
201 | L(matches32): /* %eax = compare mask for the 16 bytes at %rdi + 32.  */ |
202 | bsf %eax, %eax |
203 | lea 32(%rax, %rdi), %rax |
204 | ret |
205 | |
206 | END (RAWMEMCHR) |
207 | #endif |
208 | |