/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#if IS_IN (libc)

# include <sysdep.h>

/* The multiarch build may predefine STRRCHR to select the exported
   symbol name; default to the AVX2 variant.  */
# ifndef STRRCHR
# define STRRCHR __strrchr_avx2
# endif

/* For wcsrchr, compare whole 32-bit wide characters (dword ops);
   for strrchr, compare individual bytes.  */
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
# endif

/* VZEROUPPER may be predefined by a wrapper build that needs a
   different upper-state-clearing sequence -- NOTE(review): presumably
   the RTM-safe variants; confirm against the multiarch wrappers.  */
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

/* Section suffix so variants land in their own text section.  */
# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* Bytes processed per YMM vector.  */
# define VEC_SIZE 32
44
.section SECTION(.text),"ax",@progbits
/*----------------------------------------------------------------------
   char *strrchr (const char *s, int c)    -- or wchar_t * for wcsrchr.
   ABI:  SysV AMD64.
   In:   rdi = s, esi = c.
   Out:  rax = pointer to the LAST occurrence of c in s, or NULL.

   Register roles throughout:
     ymm0 = all-zero (terminator test), ymm4 = c broadcast to all lanes.
     eax  = CHAR-match bitmask of the current vector,
     ecx/edx = nul-match bitmask (varies by path).
   "Remembered match" protocol while scanning forward:
     edx = CHAR-match bitmask of the most recent vector containing c,
     rsi = pointer one VEC_SIZE past that vector
   so the final answer is rsi - VEC_SIZE + bsr(edx).  */
ENTRY (STRRCHR)
	movd	%esi, %xmm4
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM4. */
	VPBROADCAST %xmm4, %ymm4
	vpxor	%xmm0, %xmm0, %xmm0

	/* Check if we may cross page boundary with one vector load.
	   If (s % 64) <= 32 the 32-byte unaligned load stays inside a
	   single 64-byte-aligned block, hence cannot cross a page.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cros_page_boundary)

	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2	/* lanes equal to nul */
	VPCMPEQ	%ymm1, %ymm4, %ymm3	/* lanes equal to CHAR */
	vpmovmskb %ymm2, %ecx		/* ecx = nul bitmask */
	vpmovmskb %ymm3, %eax		/* eax = CHAR bitmask */
	addq	$VEC_SIZE, %rdi

	testl	%eax, %eax
	jnz	L(first_vec)

	/* No CHAR in the first vector; if the terminator is here the
	   string contains no CHAR at all.  */
	testl	%ecx, %ecx
	jnz	L(return_null)

	andq	$-VEC_SIZE, %rdi	/* round down for aligned loads */
	xorl	%edx, %edx		/* no remembered match yet */
	jmp	L(aligned_loop)

	.p2align 4
L(first_vec):
	/* Check if there is a nul CHAR. */
	testl	%ecx, %ecx
	jnz	L(char_and_nul_in_first_vec)

	/* Remember the match and keep searching. */
	movl	%eax, %edx
	movq	%rdi, %rsi		/* rdi already VEC_SIZE past the vector */
	andq	$-VEC_SIZE, %rdi
	jmp	L(aligned_loop)

	.p2align 4
L(cros_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx	/* ecx = s % VEC_SIZE */
	andq	$-VEC_SIZE, %rdi	/* align down: aligned load is safe */
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %edx
	vpmovmskb %ymm3, %eax
	/* Discard mask bits for bytes before s: afterwards bit 0 of
	   each mask corresponds to *s.  */
	shrl	%cl, %edx
	shrl	%cl, %eax
	addq	$VEC_SIZE, %rdi

	/* Check if there is a CHAR. */
	testl	%eax, %eax
	jnz	L(found_char)

	testl	%edx, %edx
	jnz	L(return_null)

	jmp	L(aligned_loop)

	.p2align 4
L(found_char):
	testl	%edx, %edx
	jnz	L(char_and_nul)		/* terminator in the same vector */

	/* Remember the match and keep searching.  rsi = rdi + rcx
	   compensates for the shift above, so the rsi/edx pair follows
	   the same convention as the main loop (rsi = s + VEC_SIZE).  */
	movl	%eax, %edx
	leaq	(%rdi, %rcx), %rsi

	/* Main loop, 4x unrolled: one aligned load per VEC, combining
	   the CHAR and nul masks with OR so a single branch detects
	   either event; which one it was is sorted out at
	   L(char_nor_null).  */
	.p2align 4
L(aligned_loop):
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx		/* sets ZF for the branch below */
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	add	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jz	L(aligned_loop)

	.p2align 4
L(char_nor_null):
	/* Find a CHAR or a nul CHAR in a loop. */
	testl	%eax, %eax
	jnz	L(match)
L(return_value):
	/* Terminator reached and no CHAR in this vector: answer is the
	   remembered match, or NULL if there never was one.  */
	testl	%edx, %edx
	jz	L(return_null)
	movl	%edx, %eax		/* mask of the remembered vector */
	movq	%rsi, %rdi		/* one VEC_SIZE past that vector */

# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.
	   (vpmovmskb sets 4 bits per matching dword.)  */
	andl	$0x11111111, %eax
# endif
	bsrl	%eax, %eax		/* byte index of the LAST match */
	leaq	-VEC_SIZE(%rdi, %rax), %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(match):
	/* Find a CHAR.  Check if there is a nul CHAR. */
	vpmovmskb %ymm2, %ecx		/* re-extract nul mask; ecx was clobbered */
	testl	%ecx, %ecx
	jnz	L(find_nul)

	/* Remember the match and keep searching. */
	movl	%eax, %edx
	movq	%rdi, %rsi
	jmp	L(aligned_loop)

	.p2align 4
L(find_nul):
	/* Vector holds both CHAR and nul.  */
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr. */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR:
	   ecx ^ (ecx - 1) = all bits up to and including the lowest
	   set bit of ecx, i.e. up to the first terminator.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* If there is no CHAR here, return the remembered one. */
	jz	L(return_value)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4
L(char_and_nul):
	/* Find both a CHAR and a nul CHAR (page-cross first vector).  */
	addq	%rcx, %rdi		/* undo the page-cross shift offset */
	movl	%edx, %ecx		/* ecx = nul mask, as the code below expects */
L(char_and_nul_in_first_vec):
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr. */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR (same
	   ecx ^ (ecx - 1) idiom as in L(find_nul)).  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* Return null pointer if the nul CHAR comes first. */
	jz	L(return_null)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4
L(return_null):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

END (STRRCHR)
236#endif
237