/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR __strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define VPMINU vpminud
#  define CHAR_REG esi
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define VPMINU vpminub
#  define CHAR_REG sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
	/* Broadcast CHAR to YMM0. */
	vmovd %esi, %xmm0
	movl %edi, %eax
	andl $(PAGE_SIZE - 1), %eax
	VPBROADCAST %xmm0, %ymm0
	vpxor %xmm1, %xmm1, %xmm1
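	/* ymm1 stays an all-zero vector for the whole function; comparing
	   data against it detects the null terminator. */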

	/* Check if we cross page boundary with one vector load. */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)
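	/* If the page offset in eax is greater than PAGE_SIZE - VEC_SIZE,
	   an unaligned VEC_SIZE-byte load from rdi would cross into the
	   next page, which may be unmapped, so the branch above takes
	   the slow path. */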

	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
	   null byte. */
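	/* Rough C intrinsics equivalent of the sequence below
	   (illustrative sketch for the byte variant only; variable names
	   are ad hoc and ymm0_char stands for the broadcast CHAR):

	       __m256i v = _mm256_loadu_si256 ((const __m256i *) s);
	       __m256i hits = _mm256_or_si256
		 (_mm256_cmpeq_epi8 (v, ymm0_char),
		  _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ()));
	       int mask = _mm256_movemask_epi8 (hits);

	   Bit i of mask is set iff byte i is CHAR or the null
	   terminator. */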
	vmovdqu (%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here. The expectation is
	   that with strchr the user will branch on the result (e.g.
	   checking whether it is NULL). Since this branch will be 100%
	   predictive of the user branch, a branch miss here should save
	   what would otherwise be a branch miss in the user code.
	   Using a branch also 1) saves code size and 2) is faster in
	   highly predictable environments. */
	jne L(zero)
# endif
	addq %rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	/* Use bsf to save code size. */
	bsfl %eax, %eax
	incq %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x2):
	/* Use bsf to save code size. */
	bsfl %eax, %eax
	addq $(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_vec_x3):
	/* Use bsf to save code size. */
	bsfl %eax, %eax
	addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x4):
	/* Use bsf to save code size. */
	bsfl %eax, %eax
	addq $(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN



	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check. */
	orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE bytes. Only one VEC_SIZE at a time
	   since the data is only aligned to VEC_SIZE. */
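	/* Whether we fell through from above or jumped here from the
	   page-cross path, rdi has been rounded up to the last byte of
	   its VEC_SIZE block, so the +1, +(VEC_SIZE + 1), ...
	   displacements below give VEC_SIZE-aligned addresses for the
	   vmovdqa loads. */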
	vmovdqa 1(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 - 1. */
	incq %rdi
	orq $(VEC_SIZE * 4 - 1), %rdi
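	/* incq plus orq leave rdi at the last byte of a (VEC_SIZE * 4)-
	   aligned block, so the four loads per loop iteration below are
	   all aligned. The first iteration may re-examine up to three
	   VECs that were just checked; that is harmless since they
	   contain no match. */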
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward. */
	vmovdqa 1(%rdi), %ymm6
	vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7

	/* The XOR leaves a zero element exactly where the input matches
	   CHAR (broadcast from esi). */
	vpxor %ymm6, %ymm0, %ymm2
	vpxor %ymm7, %ymm0, %ymm3

	VPMINU %ymm2, %ymm6, %ymm2
	VPMINU %ymm3, %ymm7, %ymm3
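	/* Unsigned min of (x ^ CHAR) and x is zero exactly when x equals
	   CHAR (the XOR term is zero) or x is the null terminator (x
	   itself is zero), so a single compare against zero later
	   detects both cases. */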

	vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
	vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7

	vpxor %ymm6, %ymm0, %ymm4
	vpxor %ymm7, %ymm0, %ymm5

	VPMINU %ymm4, %ymm6, %ymm4
	VPMINU %ymm5, %ymm7, %ymm5

	VPMINU %ymm2, %ymm3, %ymm6
	VPMINU %ymm4, %ymm5, %ymm7

	VPMINU %ymm6, %ymm7, %ymm7
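	/* ymm7 now has a zero element at position i iff at least one of
	   the four VECs has CHAR or the null terminator at position
	   i. */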

	VPCMPEQ %ymm7, %ymm1, %ymm7
	vpmovmskb %ymm7, %ecx
	subq $-(VEC_SIZE * 4), %rdi
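	/* subq with a negative immediate is addq $(VEC_SIZE * 4), but
	   -(VEC_SIZE * 4) = -128 fits in a sign-extended imm8 while +128
	   does not, giving a shorter encoding. */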
	testl %ecx, %ecx
	jz L(loop_4x_vec)

	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpmovmskb %ymm2, %eax
	testl %eax, %eax
	jnz L(last_vec_x0)


	VPCMPEQ %ymm3, %ymm1, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(last_vec_x1)

	VPCMPEQ %ymm4, %ymm1, %ymm4
	vpmovmskb %ymm4, %eax
	/* rcx has the combined result from all 4 VEC. It will only be
	   used if none of the first 3 VEC contained a match. */
	salq $32, %rcx
	orq %rcx, %rax
	tzcntq %rax, %rax
	subq $(VEC_SIZE * 2 - 1), %rdi
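	/* rdi now points at the start of the third VEC of this
	   iteration. Bits 0-31 of rax are the third VEC's matches; bits
	   32-63 come from the combined mask and only matter when the
	   third VEC is empty, in which case they can only stem from the
	   fourth VEC. tzcntq therefore gave the offset of the first
	   CHAR or null relative to rdi. */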
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero_end)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x0):
	/* Use bsf to save code size. */
	bsfl %eax, %eax
	addq $-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero_end)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x1):
	tzcntl %eax, %eax
	subq $(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte. */
	cmp (%rdi, %rax), %CHAR_REG
	jne L(zero_end)
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load. */
	.p2align 4,, 8
L(cross_page_boundary):
	movq %rdi, %rdx
	/* Align rdi to VEC_SIZE - 1. */
	orq $(VEC_SIZE - 1), %rdi
	vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
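	/* rdi was rounded up to the last byte of its VEC_SIZE block, so
	   this aligned load reads the whole block containing the start
	   of the string without touching the following page. */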
	VPCMPEQ %ymm2, %ymm0, %ymm3
	VPCMPEQ %ymm2, %ymm1, %ymm2
	vpor %ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	/* Remove the leading bytes. sarxl only uses bits [4:0] of the
	   count for a 32-bit shift, so there is no need to manually mod
	   edx. */
	sarxl %edx, %eax, %eax
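	/* The effective shift count is the start's offset within the
	   block (rdx mod VEC_SIZE via the hardware count masking), so
	   bit i of eax now corresponds to byte i from the original,
	   unaligned start in rdx. The arithmetic fill from sarx can only
	   set bits above an already-set bit, so the test and tzcntl
	   below are unaffected. */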
	testl %eax, %eax
	jz L(cross_page_continue)
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl %ecx, %ecx
	/* Found CHAR or the null byte. */
	cmp (%rdx, %rax), %CHAR_REG
	jne L(zero_end)
# endif
	addq %rdx, %rax
	VZEROUPPER_RETURN

END (STRCHR)
#endif