1 | /* memrchr optimized with AVX2. |
2 | Copyright (C) 2017-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef VZEROUPPER |
24 | # define VZEROUPPER vzeroupper |
25 | # endif |
26 | |
27 | # define VEC_SIZE 32 |
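
/* memrchr (const void *s, int c, size_t n) returns a pointer to the last
   byte equal to (unsigned char) c among the first n bytes of s, or NULL
   if there is none.  A plain C sketch of that behaviour, for reference
   only (it is not the implementation below):

     void *
     memrchr (const void *s, int c, size_t n)
     {
       const unsigned char *p = (const unsigned char *) s + n;
       while (p != s)
         if (*--p == (unsigned char) c)
           return (void *) p;
       return NULL;
     }

   The code below receives S in RDI, C in ESI and N in RDX (the *_LP
   register names from sysdep.h select the pointer-width register so the
   same source also builds for x32), broadcasts C to all VEC_SIZE bytes
   of YMM0 and scans the buffer backwards, VEC_SIZE or 4 * VEC_SIZE bytes
   at a time, using VPCMPEQB and VPMOVMSKB.  */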
28 | |
	.section .text.avx,"ax",@progbits
30 | ENTRY (__memrchr_avx2) |
31 | /* Broadcast CHAR to YMM0. */ |
32 | vmovd %esi, %xmm0 |
33 | vpbroadcastb %xmm0, %ymm0 |
34 | |
35 | sub $VEC_SIZE, %RDX_LP |
36 | jbe L(last_vec_or_less) |
37 | |
38 | add %RDX_LP, %RDI_LP |
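	/* RDX now holds N - VEC_SIZE, so RDI points to the start of the
	   last VEC_SIZE-byte window of the buffer (S + N - VEC_SIZE).
	   Buffers with N <= VEC_SIZE took the jbe above instead.  */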
39 | |
40 | /* Check the last VEC_SIZE bytes. */ |
41 | vpcmpeqb (%rdi), %ymm0, %ymm1 |
42 | vpmovmskb %ymm1, %eax |
43 | testl %eax, %eax |
44 | jnz L(last_vec_x0) |
45 | |
46 | subq $(VEC_SIZE * 4), %rdi |
47 | movl %edi, %ecx |
48 | andl $(VEC_SIZE - 1), %ecx |
49 | jz L(aligned_more) |
50 | |
51 | /* Align data for aligned loads in the loop. */ |
52 | addq $VEC_SIZE, %rdi |
53 | addq $VEC_SIZE, %rdx |
54 | andq $-VEC_SIZE, %rdi |
55 | subq %rcx, %rdx |
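	/* RDI is rounded up to the next VEC_SIZE boundary and RDX grows by
	   the same amount, so no byte is skipped; the first aligned block
	   merely overlaps the tail that was already checked above.  */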
56 | |
57 | .p2align 4 |
58 | L(aligned_more): |
59 | subq $(VEC_SIZE * 4), %rdx |
60 | jbe L(last_4x_vec_or_less) |
61 | |
	/* Check the last 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
64 | vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
65 | vpmovmskb %ymm1, %eax |
66 | testl %eax, %eax |
67 | jnz L(last_vec_x3) |
68 | |
69 | vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 |
70 | vpmovmskb %ymm2, %eax |
71 | testl %eax, %eax |
72 | jnz L(last_vec_x2) |
73 | |
74 | vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 |
75 | vpmovmskb %ymm3, %eax |
76 | testl %eax, %eax |
77 | jnz L(last_vec_x1) |
78 | |
79 | vpcmpeqb (%rdi), %ymm0, %ymm4 |
80 | vpmovmskb %ymm4, %eax |
81 | testl %eax, %eax |
82 | jnz L(last_vec_x0) |
83 | |
	/* Align data to 4 * VEC_SIZE so the loop needs fewer branches.
	   There is some overlap with the checks above if data isn't
	   already aligned to 4 * VEC_SIZE.  */
87 | movl %edi, %ecx |
88 | andl $(VEC_SIZE * 4 - 1), %ecx |
89 | jz L(loop_4x_vec) |
90 | |
91 | addq $(VEC_SIZE * 4), %rdi |
92 | addq $(VEC_SIZE * 4), %rdx |
93 | andq $-(VEC_SIZE * 4), %rdi |
94 | subq %rcx, %rdx |
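	/* Again RDI and RDX change by the same amount, so nothing is
	   skipped; the first loop iteration merely re-reads bytes already
	   covered by the checks above.  */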
95 | |
96 | .p2align 4 |
97 | L(loop_4x_vec): |
	/* Compare 4 * VEC at a time, moving backwards through the
	   buffer.  */
99 | subq $(VEC_SIZE * 4), %rdi |
100 | subq $(VEC_SIZE * 4), %rdx |
101 | jbe L(last_4x_vec_or_less) |
102 | |
103 | vmovdqa (%rdi), %ymm1 |
104 | vmovdqa VEC_SIZE(%rdi), %ymm2 |
105 | vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 |
106 | vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 |
107 | |
108 | vpcmpeqb %ymm1, %ymm0, %ymm1 |
109 | vpcmpeqb %ymm2, %ymm0, %ymm2 |
110 | vpcmpeqb %ymm3, %ymm0, %ymm3 |
111 | vpcmpeqb %ymm4, %ymm0, %ymm4 |
112 | |
113 | vpor %ymm1, %ymm2, %ymm5 |
114 | vpor %ymm3, %ymm4, %ymm6 |
115 | vpor %ymm5, %ymm6, %ymm5 |
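	/* YMM5 has a nonzero byte iff at least one of the four vectors
	   contained CHAR.  */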
116 | |
117 | vpmovmskb %ymm5, %eax |
118 | testl %eax, %eax |
119 | jz L(loop_4x_vec) |
120 | |
	/* There is a match: find the highest (closest to the end of the
	   buffer) vector that contains it.  */
122 | vpmovmskb %ymm4, %eax |
123 | testl %eax, %eax |
124 | jnz L(last_vec_x3) |
125 | |
126 | vpmovmskb %ymm3, %eax |
127 | testl %eax, %eax |
128 | jnz L(last_vec_x2) |
129 | |
130 | vpmovmskb %ymm2, %eax |
131 | testl %eax, %eax |
132 | jnz L(last_vec_x1) |
133 | |
134 | vpmovmskb %ymm1, %eax |
135 | bsrl %eax, %eax |
136 | addq %rdi, %rax |
137 | VZEROUPPER |
138 | ret |
139 | |
140 | .p2align 4 |
141 | L(last_4x_vec_or_less): |
142 | addl $(VEC_SIZE * 4), %edx |
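	/* EDX is now the number of buffer bytes that fall inside the
	   4 * VEC_SIZE block starting at RDI (the block may begin before
	   the start of the buffer).  */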
143 | cmpl $(VEC_SIZE * 2), %edx |
144 | jbe L(last_2x_vec) |
145 | |
146 | vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
147 | vpmovmskb %ymm1, %eax |
148 | testl %eax, %eax |
149 | jnz L(last_vec_x3) |
150 | |
151 | vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 |
152 | vpmovmskb %ymm2, %eax |
153 | testl %eax, %eax |
154 | jnz L(last_vec_x2) |
155 | |
156 | vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 |
157 | vpmovmskb %ymm3, %eax |
158 | testl %eax, %eax |
159 | jnz L(last_vec_x1_check) |
160 | cmpl $(VEC_SIZE * 3), %edx |
161 | jbe L(zero) |
162 | |
163 | vpcmpeqb (%rdi), %ymm0, %ymm4 |
164 | vpmovmskb %ymm4, %eax |
165 | testl %eax, %eax |
166 | jz L(zero) |
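	/* A match in this lowest VEC may still lie before the start of
	   the buffer; it is valid only if its bit index plus
	   EDX - 4 * VEC_SIZE is non-negative, which is checked below.  */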
167 | bsrl %eax, %eax |
168 | subq $(VEC_SIZE * 4), %rdx |
169 | addq %rax, %rdx |
170 | jl L(zero) |
171 | addq %rdi, %rax |
172 | VZEROUPPER |
173 | ret |
174 | |
175 | .p2align 4 |
176 | L(last_2x_vec): |
177 | vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
178 | vpmovmskb %ymm1, %eax |
179 | testl %eax, %eax |
180 | jnz L(last_vec_x3_check) |
181 | cmpl $VEC_SIZE, %edx |
182 | jbe L(zero) |
183 | |
184 | vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
185 | vpmovmskb %ymm1, %eax |
186 | testl %eax, %eax |
187 | jz L(zero) |
188 | bsrl %eax, %eax |
189 | subq $(VEC_SIZE * 2), %rdx |
190 | addq %rax, %rdx |
191 | jl L(zero) |
192 | addl $(VEC_SIZE * 2), %eax |
193 | addq %rdi, %rax |
194 | VZEROUPPER |
195 | ret |
196 | |
197 | .p2align 4 |
198 | L(last_vec_x0): |
199 | bsrl %eax, %eax |
200 | addq %rdi, %rax |
201 | VZEROUPPER |
202 | ret |
203 | |
204 | .p2align 4 |
205 | L(last_vec_x1): |
206 | bsrl %eax, %eax |
207 | addl $VEC_SIZE, %eax |
208 | addq %rdi, %rax |
209 | VZEROUPPER |
210 | ret |
211 | |
212 | .p2align 4 |
213 | L(last_vec_x2): |
214 | bsrl %eax, %eax |
215 | addl $(VEC_SIZE * 2), %eax |
216 | addq %rdi, %rax |
217 | VZEROUPPER |
218 | ret |
219 | |
220 | .p2align 4 |
221 | L(last_vec_x3): |
222 | bsrl %eax, %eax |
223 | addl $(VEC_SIZE * 3), %eax |
	addq	%rdi, %rax
	VZEROUPPER
	ret
226 | |
227 | .p2align 4 |
228 | L(last_vec_x1_check): |
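	/* The match is at RDI + VEC_SIZE + bit index; as in the check
	   above, discard it if that falls before the start of the
	   buffer.  */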
229 | bsrl %eax, %eax |
230 | subq $(VEC_SIZE * 3), %rdx |
231 | addq %rax, %rdx |
232 | jl L(zero) |
233 | addl $VEC_SIZE, %eax |
234 | addq %rdi, %rax |
235 | VZEROUPPER |
236 | ret |
237 | |
238 | .p2align 4 |
239 | L(last_vec_x3_check): |
240 | bsrl %eax, %eax |
241 | subq $VEC_SIZE, %rdx |
242 | addq %rax, %rdx |
243 | jl L(zero) |
244 | addl $(VEC_SIZE * 3), %eax |
245 | addq %rdi, %rax |
246 | VZEROUPPER |
247 | ret |
248 | |
249 | .p2align 4 |
250 | L(zero): |
251 | VZEROUPPER |
252 | L(null): |
253 | xorl %eax, %eax |
254 | ret |
255 | |
256 | .p2align 4 |
257 | L(last_vec_or_less_aligned): |
258 | movl %edx, %ecx |
259 | |
260 | vpcmpeqb (%rdi), %ymm0, %ymm1 |
261 | |
262 | movl $1, %edx |
	/* The shift count in CL can be as large as VEC_SIZE (32); a 32-bit
	   shift by 32 would be a no-op, so build the (1 << length) - 1
	   mask with a 64-bit shift.  */
264 | salq %cl, %rdx |
265 | subq $1, %rdx |
266 | |
267 | vpmovmskb %ymm1, %eax |
268 | |
269 | /* Remove the trailing bytes. */ |
270 | andl %edx, %eax |
271 | testl %eax, %eax |
272 | jz L(zero) |
273 | |
274 | bsrl %eax, %eax |
275 | addq %rdi, %rax |
276 | VZEROUPPER |
277 | ret |
278 | |
279 | .p2align 4 |
280 | L(last_vec_or_less): |
281 | addl $VEC_SIZE, %edx |
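	/* EDX is the original length N, with 0 <= N <= VEC_SIZE.  */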
282 | |
283 | /* Check for zero length. */ |
284 | testl %edx, %edx |
285 | jz L(null) |
286 | |
287 | movl %edi, %ecx |
288 | andl $(VEC_SIZE - 1), %ecx |
289 | jz L(last_vec_or_less_aligned) |
290 | |
291 | movl %ecx, %esi |
292 | movl %ecx, %r8d |
293 | addl %edx, %esi |
294 | andq $-VEC_SIZE, %rdi |
295 | |
296 | subl $VEC_SIZE, %esi |
297 | ja L(last_vec_2x_aligned) |
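	/* ECX and R8D hold the misalignment of S within a VEC_SIZE block
	   and RDI is S rounded down to a VEC_SIZE boundary.  The ja above
	   is taken when misalignment + N > VEC_SIZE, i.e. when the buffer
	   spans two aligned vectors.  */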
298 | |
299 | /* Check the last VEC. */ |
300 | vpcmpeqb (%rdi), %ymm0, %ymm1 |
301 | vpmovmskb %ymm1, %eax |
302 | |
	/* Remove the leading and trailing bytes.  The arithmetic shift is
	   safe here: the misalignment in CL plus the length never exceeds
	   VEC_SIZE on this path, so any sign-propagated high bits are
	   cleared by the length mask below.  */
304 | sarl %cl, %eax |
305 | movl %edx, %ecx |
306 | |
307 | movl $1, %edx |
308 | sall %cl, %edx |
309 | subl $1, %edx |
310 | |
311 | andl %edx, %eax |
312 | testl %eax, %eax |
313 | jz L(zero) |
314 | |
315 | bsrl %eax, %eax |
316 | addq %rdi, %rax |
317 | addq %r8, %rax |
318 | VZEROUPPER |
319 | ret |
320 | |
321 | .p2align 4 |
322 | L(last_vec_2x_aligned): |
323 | movl %esi, %ecx |
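	/* ECX (copied from ESI) is misalignment + N - VEC_SIZE: the number
	   of buffer bytes that fall in the upper of the two vectors.  */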
324 | |
325 | /* Check the last VEC. */ |
326 | vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 |
327 | |
328 | movl $1, %edx |
329 | sall %cl, %edx |
330 | subl $1, %edx |
331 | |
332 | vpmovmskb %ymm1, %eax |
333 | |
334 | /* Remove the trailing bytes. */ |
335 | andl %edx, %eax |
336 | |
337 | testl %eax, %eax |
338 | jnz L(last_vec_x1) |
339 | |
340 | /* Check the second last VEC. */ |
341 | vpcmpeqb (%rdi), %ymm0, %ymm1 |
342 | |
343 | movl %r8d, %ecx |
344 | |
345 | vpmovmskb %ymm1, %eax |
346 | |
347 | /* Remove the leading bytes. Must use unsigned right shift for |
348 | bsrl below. */ |
349 | shrl %cl, %eax |
350 | testl %eax, %eax |
351 | jz L(zero) |
352 | |
353 | bsrl %eax, %eax |
354 | addq %rdi, %rax |
355 | addq %r8, %rax |
356 | VZEROUPPER |
357 | ret |
358 | END (__memrchr_avx2) |
359 | #endif |
360 | |