/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define CHAR_SIZE 4
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

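/* Overview, with a rough C sketch of the semantics this file
   implements (illustration only, not glibc's actual generic C
   version; CHAR stands for char or wchar_t depending on
   USE_AS_WCSRCHR):

       CHAR *
       strrchr (const CHAR *s, CHAR c)
       {
         const CHAR *ret = NULL;
         do
           if (*s == c)
             ret = s;
         while (*s++ != '\0');
         return (CHAR *) ret;
       }

   The vectorized version scans VEC_SIZE bytes at a time, checking for
   the null terminator first and delaying the search CHAR comparisons
   until they are actually needed.  */
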
        .section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
        vmovd %esi, %xmm7
        movl %edi, %eax
        /* Broadcast CHAR to YMM7.  */
        VPBROADCAST %xmm7, %ymm7
        vpxor %xmm0, %xmm0, %xmm0

        /* Shift here instead of `andl` to save code size (saves a
           fetch block).  */
        sall $20, %eax
        cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
        ja L(cross_page)
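        /* The shifted compare is equivalent to `(rdi & (PAGE_SIZE - 1))
           > PAGE_SIZE - VEC_SIZE`: the 12 page-offset bits land in bits
           31:20 of eax, so the branch is taken exactly when an
           unaligned VEC_SIZE load from rdi would cross a page.  */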

L(page_cross_continue):
        vmovdqu (%rdi), %ymm1
        /* Check for the end-of-string match.  */
        VPCMPEQ %ymm1, %ymm0, %ymm6
        vpmovmskb %ymm6, %ecx
        testl %ecx, %ecx
        jz L(aligned_more)

        /* Only check match with search CHAR if needed.  */
        VPCMPEQ %ymm1, %ymm7, %ymm1
        vpmovmskb %ymm1, %eax
        /* Check if match before first zero: `blsmsk` sets every bit up
           to and including the lowest set bit (e.g. blsmsk (0b01011000)
           == 0b00001111), so the `and` discards matches past the first
           null terminator.  */
        blsmskl %ecx, %ecx
        andl %ecx, %eax
        jz L(ret0)
        bsrl %eax, %eax
        addq %rdi, %rax
        /* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
           search CHAR is zero we are correct.  Either way `andq
           -CHAR_SIZE, %rax` gets the correct result.  */
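        /* (The off-by-3 comes from vpmovmskb setting 4 mask bits per
           matching wchar_t: bsrl lands on the last byte of the match,
           and rounding down to a CHAR_SIZE boundary recovers its first
           byte.)  */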
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        /* The return paths for the first VEC x1/x2 hard-code the
           backward search for earlier matches.  */
        .p2align 4,, 10
L(first_vec_x1):
        VPCMPEQ %ymm2, %ymm7, %ymm6
        vpmovmskb %ymm6, %eax
        blsmskl %ecx, %ecx
        andl %ecx, %eax
        jnz L(first_vec_x1_return)

        .p2align 4,, 4
L(first_vec_x0_test):
        VPCMPEQ %ymm1, %ymm7, %ymm6
        vpmovmskb %ymm6, %eax
        testl %eax, %eax
        jz L(ret1)
        bsrl %eax, %eax
        addq %r8, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
L(ret1):
        VZEROUPPER_RETURN

        .p2align 4,, 10
L(first_vec_x0_x1_test):
        VPCMPEQ %ymm2, %ymm7, %ymm6
        vpmovmskb %ymm6, %eax
        /* Check ymm2 for a search CHAR match.  If there is none, check
           ymm1 before returning.  */
        testl %eax, %eax
        jz L(first_vec_x0_test)
        .p2align 4,, 4
L(first_vec_x1_return):
        bsrl %eax, %eax
        leaq 1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        VZEROUPPER_RETURN


        .p2align 4,, 10
L(first_vec_x2):
        VPCMPEQ %ymm3, %ymm7, %ymm6
        vpmovmskb %ymm6, %eax
        blsmskl %ecx, %ecx
        /* If there is no in-range search CHAR match in ymm3, check
           ymm1/ymm2 for an earlier match (search CHAR matches are
           delayed until needed).  */
        andl %ecx, %eax
        jz L(first_vec_x0_x1_test)
        bsrl %eax, %eax
        leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        VZEROUPPER_RETURN


        .p2align 4
L(aligned_more):
        /* Save the original pointer in case the match is in VEC 0.  */
        movq %rdi, %r8

        /* Align src.  */
        orq $(VEC_SIZE - 1), %rdi
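        /* `orq` leaves rdi pointing at the last byte of its current
           VEC, so the 1(%rdi) and (VEC_SIZE + 1)(%rdi) loads below are
           VEC_SIZE-aligned while the return paths keep a uniform +1
           offset regardless of the original misalignment.  */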
        vmovdqu 1(%rdi), %ymm2
        VPCMPEQ %ymm2, %ymm0, %ymm6
        vpmovmskb %ymm6, %ecx
        testl %ecx, %ecx
        jnz L(first_vec_x1)

        vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
        VPCMPEQ %ymm3, %ymm0, %ymm6
        vpmovmskb %ymm6, %ecx
        testl %ecx, %ecx
        jnz L(first_vec_x2)

        /* Save pointer again before realigning.  */
        movq %rdi, %rsi
        addq $(VEC_SIZE + 1), %rdi
        andq $-(VEC_SIZE * 2), %rdi
        .p2align 4
L(first_aligned_loop):
        /* Do 2x VEC at a time.  Any more and the cost of finding the
           match outweighs the loop benefit.  */
        vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
        vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

        VPCMPEQ %ymm4, %ymm7, %ymm6
        VPMIN %ymm4, %ymm5, %ymm8
        VPCMPEQ %ymm5, %ymm7, %ymm10
        vpor %ymm6, %ymm10, %ymm5
        VPCMPEQ %ymm8, %ymm0, %ymm8
        vpor %ymm5, %ymm8, %ymm9
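        /* `VPMIN` folds the two VECs together: a null in either ymm4
           or ymm5 forces a zero lane in ymm8, so the single compare
           against %ymm0 tests both VECs for the terminator at once.  */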

        vpmovmskb %ymm9, %eax
        addq $(VEC_SIZE * 2), %rdi
        /* No zero or search CHAR.  */
        testl %eax, %eax
        jz L(first_aligned_loop)

        /* If there is no zero CHAR, go to the second loop (this allows
           us to throw away all prior work).  */
        vpmovmskb %ymm8, %ecx
        testl %ecx, %ecx
        jz L(second_aligned_loop_prep)
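        /* strrchr wants the *last* match, so once this 2x VEC block has
           a search CHAR match but no null terminator, every earlier
           match is dead: the second loop only tracks the most recent
           matching block and can discard everything found so far.  */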

        /* The search CHAR could be zero, so we need to get the true
           match.  */
        vpmovmskb %ymm5, %eax
        testl %eax, %eax
        jnz L(first_aligned_loop_return)

        .p2align 4,, 4
L(first_vec_x1_or_x2):
        VPCMPEQ %ymm3, %ymm7, %ymm3
        VPCMPEQ %ymm2, %ymm7, %ymm2
        vpmovmskb %ymm3, %eax
        vpmovmskb %ymm2, %edx
        /* Use add instead of or: on many recent Intel cores `add` can
           macro-fuse with the following `jz`, which `or` cannot.  */
        addq %rax, %rdx
        jz L(first_vec_x0_test)
        /* NB: We could move this shift to before the branch and save a
           bit of code size / performance on the fall through.  The
           branch leads to the null case which generally seems hotter
           than a char in the first 3x VEC.  */
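        /* Combine the two 32-bit masks into a single 64-bit mask with
           the later VEC (ymm3) in the high bits, so one bsrq finds the
           most recent match across both VECs.  */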
        salq $32, %rax
        addq %rdx, %rax
        bsrq %rax, %rax
        leaq 1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        VZEROUPPER_RETURN

        .p2align 4,, 8
L(first_aligned_loop_return):
        VPCMPEQ %ymm4, %ymm0, %ymm4
        vpmovmskb %ymm4, %edx
        salq $32, %rcx
        orq %rdx, %rcx

        vpmovmskb %ymm10, %eax
        vpmovmskb %ymm6, %edx
        salq $32, %rax
        orq %rdx, %rax
        blsmskq %rcx, %rcx
        andq %rcx, %rax
        jz L(first_vec_x1_or_x2)

        bsrq %rax, %rax
        leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        VZEROUPPER_RETURN

        /* The search CHAR cannot be zero here: this path is reached
           only after a search CHAR match was found with no null
           terminator in the same 2x VEC.  */
        .p2align 4
L(second_aligned_loop_set_furthest_match):
        /* Save VEC and pointer from the most recent match.  */
L(second_aligned_loop_prep):
        movq %rdi, %rsi
        vmovdqu %ymm6, %ymm2
        vmovdqu %ymm10, %ymm3

        .p2align 4
L(second_aligned_loop):
        /* Search 2x VEC at a time.  */
        vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
        vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

        VPCMPEQ %ymm4, %ymm7, %ymm6
        VPMIN %ymm4, %ymm5, %ymm1
        VPCMPEQ %ymm5, %ymm7, %ymm10
        vpor %ymm6, %ymm10, %ymm5
        VPCMPEQ %ymm1, %ymm0, %ymm1
        vpor %ymm5, %ymm1, %ymm9

        vpmovmskb %ymm9, %eax
        addq $(VEC_SIZE * 2), %rdi
        testl %eax, %eax
        jz L(second_aligned_loop)
        vpmovmskb %ymm1, %ecx
        testl %ecx, %ecx
        jz L(second_aligned_loop_set_furthest_match)
        vpmovmskb %ymm5, %eax
        testl %eax, %eax
        jnz L(return_new_match)

        /* This is the hot path.  We know CHAR is inbounds and that
           ymm3/ymm2 hold the latest match.  */
        .p2align 4,, 4
L(return_old_match):
        vpmovmskb %ymm3, %eax
        vpmovmskb %ymm2, %edx
        salq $32, %rax
        orq %rdx, %rax
        bsrq %rax, %rax
        /* The search CHAR cannot be zero, so it is safe to just use
           lea for wcsrchr: bsrq lands on the last byte of the match
           and subtracting CHAR_SIZE - 1 reaches its first byte.  */
        leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rsi, %rax), %rax
        VZEROUPPER_RETURN

        /* The last iteration also potentially has a match.  */
        .p2align 4,, 8
L(return_new_match):
        VPCMPEQ %ymm4, %ymm0, %ymm4
        vpmovmskb %ymm4, %edx
        salq $32, %rcx
        orq %rdx, %rcx

        vpmovmskb %ymm10, %eax
        vpmovmskb %ymm6, %edx
        salq $32, %rax
        orq %rdx, %rax
        blsmskq %rcx, %rcx
        andq %rcx, %rax
        jz L(return_old_match)
        bsrq %rax, %rax
        /* The search CHAR cannot be zero, so it is safe to just use
           lea for wcsrchr.  */
        leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rdi, %rax), %rax
        VZEROUPPER_RETURN

        .p2align 4,, 4
L(cross_page):
        movq %rdi, %rsi
        andq $-VEC_SIZE, %rsi
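        /* The VEC_SIZE-aligned load cannot cross a page, so reading
           below rdi is safe; the shrx below uses only the low 5 bits
           of edi (the misalignment within the VEC) to shift
           out-of-range matches out of the masks.  */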
        vmovdqu (%rsi), %ymm1
        VPCMPEQ %ymm1, %ymm0, %ymm6
        vpmovmskb %ymm6, %ecx
        /* Shift out zero CHAR matches that are before the beginning of
           src (rdi).  */
        shrxl %edi, %ecx, %ecx
        testl %ecx, %ecx
        jz L(page_cross_continue)
        VPCMPEQ %ymm1, %ymm7, %ymm1
        vpmovmskb %ymm1, %eax

        /* Shift out search CHAR matches that are before the beginning
           of src (rdi).  */
        shrxl %edi, %eax, %eax
        blsmskl %ecx, %ecx
        /* Check if any search CHAR match is in range.  */
        andl %ecx, %eax
        jz L(ret2)
        bsrl %eax, %eax
        addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
L(ret2):
        VZEROUPPER_RETURN
END(STRRCHR)
#endif