/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm9, %xmm9, %xmm9

	/* Check if we cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
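	/* If the jump is taken, (%rdi & (PAGE_SIZE - 1)) is within
	   VEC_SIZE - 1 bytes of the end of the page, so an unaligned
	   VEC_SIZE-byte load at %rdi could touch the next, possibly
	   unmapped, page.  */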

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
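	/* Each element of ymm1 is all-ones iff the corresponding data
	   element equals CHAR or is 0; vpmovmskb packs that into one bit
	   per byte, so tzcnt yields the byte offset of the first hit.  */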
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	/* .p2align 5 helps keep performance more consistent if ENTRY()
	   alignment % 32 was either 16 or 0.  It also fixes the
	   alignment % 32 of loop_4x_vec, which makes it easier to
	   tune.  */
	.p2align 5
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
	   on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
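	/* %rdi now points to the last byte of its aligned block, so
	   1(%rdi) is VEC_SIZE-aligned and at most VEC_SIZE past the
	   original pointer: no byte is left unchecked.  */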
	vmovdqa	1(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to a multiple of VEC_SIZE * 4 for the main loop.  */
	addq	$(VEC_SIZE * 4 + 1), %rdi
	andq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm5
	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	/* The XOR leaves a zero only in elements matching CHAR (esi).  */
	vpxor	%ymm5, %ymm0, %ymm1
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3
	vpxor	%ymm8, %ymm0, %ymm4

	VPMINU	%ymm1, %ymm5, %ymm1
	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3
	VPMINU	%ymm4, %ymm8, %ymm4
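	/* After the unsigned min with the original data, an element of
	   ymm1..ymm4 is zero iff the source element was CHAR or the
	   terminating null: min(x ^ CHAR, x) == 0 <=> x == CHAR || x == 0.  */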
	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6

	VPMINU	%ymm5, %ymm6, %ymm6

	VPCMPEQ	%ymm6, %ymm9, %ymm6
	vpmovmskb	%ymm6, %ecx
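	/* Advance to the next 4 * VEC_SIZE block up front; subtracting
	   -(VEC_SIZE * 4) lets the constant fit in an imm8, where adding
	   VEC_SIZE * 4 would need an imm32.  */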
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm9, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)


	VPCMPEQ	%ymm5, %ymm9, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm3, %ymm9, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It only
	   determines the result if none of the first 3 VEC contained a
	   match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
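	/* rax now holds the VEC2 matches in bits 0-31 and the combined
	   mask in bits 32-63.  The high half only matters when bits
	   0-31 are empty, in which case any set bit there must come
	   from VEC3.  With %rdi moved back to the start of VEC2 below,
	   a tzcnt result below 32 is a byte offset into VEC2 and
	   32 + i addresses byte i of VEC3.  */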
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x0):
	tzcntl	%eax, %eax
	addq	$-(VEC_SIZE * 4), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
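	/* The aligned load above starts at the beginning of %rdi's
	   VEC_SIZE block, which lies entirely within the current page,
	   so it cannot fault even though it reads bytes before the
	   string start.  */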
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl uses only the low 5 bits of
	   the shift count, so there is no need to manually mod %edx.  */
	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.  */
	cmp	(%rdx, %rax), %CHAR_REG
	leaq	(%rdx, %rax), %rax
	cmovne	%rcx, %rax
# else
	addq	%rdx, %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

END (STRCHR)
#endif