/* memrchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_avx2
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32

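/* memrchr (s, c, n) returns a pointer to the last byte equal to
   (unsigned char) c within the n bytes starting at s, or NULL if there
   is none.  On entry RDI = s, ESI = c and RDX = n; the RDX_LP/RDI_LP
   spellings come from sysdep.h and (presumably) select the 32-bit
   registers when building for the x32 ABI.  The scan runs backwards
   from s + n in VEC_SIZE chunks: broadcast c into YMM0, compare with
   VPCMPEQB, turn the result into a bitmap with VPMOVMSKB and use BSR
   to pick the highest matching byte.  */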
	.section SECTION(.text),"ax",@progbits
ENTRY (MEMRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

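	/* If n <= VEC_SIZE, handle the whole input in the short path.
	   Otherwise RDX becomes n - VEC_SIZE and RDI is advanced to
	   s + n - VEC_SIZE, the start of the last VEC_SIZE bytes.  */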
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

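	/* No match in the last vector.  Step RDI back by 4 * VEC_SIZE
	   and, if it is not VEC_SIZE-aligned, round it up to the next
	   VEC_SIZE boundary, adjusting RDX so that it always counts the
	   bytes of the buffer below RDI + 4 * VEC_SIZE.  */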
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

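	/* Here RDI is VEC_SIZE-aligned and RDX is the number of buffer
	   bytes below RDI + 4 * VEC_SIZE.  If at most 4 * VEC_SIZE bytes
	   remain below that point, finish in the tail code; otherwise
	   check those four vectors from the highest address down.  */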
	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

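	/* Main loop: RDI is now 4 * VEC_SIZE-aligned and RDX equals
	   RDI - s.  Each iteration steps both down by 4 * VEC_SIZE,
	   leaves for the tail code once RDX is exhausted, and otherwise
	   tests the four aligned vectors at RDI with a single combined
	   mask, branching out only when one of them has a match.  */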
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time.  The window moves backwards while
	   the four vectors within it are loaded in ascending order.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match.  */
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

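	/* Tail: at most 4 * VEC_SIZE buffer bytes lie below
	   RDI + 4 * VEC_SIZE.  Restore that byte count in EDX and check
	   the vectors that still overlap the buffer from the highest
	   address down, bounds-checking any vector that may extend
	   below s.  */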
	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER_RETURN

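	/* At most 2 * VEC_SIZE valid bytes remain, all within the two
	   highest vectors of the 4 * VEC_SIZE window at RDI.  */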
	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

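	/* Return stubs.  EAX holds the VPMOVMSKB mask of the matching
	   vector: BSR selects the highest set bit, i.e. the last
	   matching byte, and the vector's offset from RDI is added back
	   in.  The _check variants additionally verify that the match
	   does not fall below the start of the buffer.  */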
	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(null):
	xorl	%eax, %eax
	ret

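	/* n <= VEC_SIZE and s is VEC_SIZE-aligned: a single aligned load
	   cannot cross a page boundary, so compare the whole vector and
	   mask off the bytes beyond s + n.  */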
	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	$1, %edx
	/* Build the mask (1 << length) - 1.  The length can be
	   VEC_SIZE (32), so use a 64-bit shift; a 32-bit shift count
	   would be truncated mod 32.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

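	/* Short case: n <= VEC_SIZE.  EDX currently holds n - VEC_SIZE;
	   recover n, handle a zero length, then dispatch on the
	   alignment of s.  */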
	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

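	/* The whole range fits in the single aligned vector at RDI:
	   shift out the ECX leading bytes below s, then keep only the
	   EDX bytes of the range.  */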
	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN

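	/* The unaligned n <= VEC_SIZE range spans two aligned vectors.
	   ESI holds the number of its bytes in the upper vector and R8D
	   the offset of s within the lower one.  */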
	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN
END (MEMRCHR)
#endif