/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
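
/* Overview: CHAR is broadcast into one YMM register and a second YMM
   register is kept at zero so that a single pass can look for CHAR and
   for the terminating null byte at the same time.  Each step loads
   VEC_SIZE (or 4 * VEC_SIZE) bytes, compares them against both
   registers, ORs the results and turns them into a byte mask with
   vpmovmskb; a non-zero mask yields the match offset via tzcnt.  A
   rough C sketch of the per-vector check for the byte (strchr) case
   follows, using AVX2 intrinsics; it is illustrative only and the
   helper name check_vec is made up here, not part of this file:

	#include <immintrin.h>

	static inline const char *
	check_vec (const char *p, __m256i vchar)
	{
	  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
	  __m256i eq_c = _mm256_cmpeq_epi8 (data, vchar);
	  __m256i eq_0 = _mm256_cmpeq_epi8 (data, _mm256_setzero_si256 ());
	  unsigned int mask
	    = _mm256_movemask_epi8 (_mm256_or_si256 (eq_c, eq_0));
	  return mask ? p + __builtin_ctz (mask) : NULL;
	}
*/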

	.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
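	/* Zero YMM9; comparing input against it flags the terminating
	   null byte.  */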
	vpxor	%xmm9, %xmm9, %xmm9
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross a page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
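	/* The aligned load below may read bytes before the start of the
	   string, but an aligned VEC_SIZE load cannot cross a page
	   boundary, so this is safe; the stray leading bytes are shifted
	   out of the mask below.  */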
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	/* Found CHAR or the null byte.  */
	tzcntl	%eax, %eax
	addq	%rcx, %rax
# ifdef USE_AS_STRCHRNUL
	addq	%rdi, %rax
# else
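	/* The match may be the terminating null byte rather than CHAR;
	   in that case return NULL.  */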
	xorl	%edx, %edx
	leaq	(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
	addq	$VEC_SIZE, %rdi

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	vmovdqa	VEC_SIZE(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi
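	/* Aligning down may step back over bytes that were already
	   checked; rescanning them in the loop is harmless since they
	   contain neither CHAR nor the null byte.  */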

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm5
	vmovdqa	VEC_SIZE(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	VPCMPEQ	%ymm5, %ymm0, %ymm1
	VPCMPEQ	%ymm6, %ymm0, %ymm2
	VPCMPEQ	%ymm7, %ymm0, %ymm3
	VPCMPEQ	%ymm8, %ymm0, %ymm4

	VPCMPEQ	%ymm5, %ymm9, %ymm5
	VPCMPEQ	%ymm6, %ymm9, %ymm6
	VPCMPEQ	%ymm7, %ymm9, %ymm7
	VPCMPEQ	%ymm8, %ymm9, %ymm8

	vpor	%ymm1, %ymm5, %ymm1
	vpor	%ymm2, %ymm6, %ymm2
	vpor	%ymm3, %ymm7, %ymm3
	vpor	%ymm4, %ymm8, %ymm4
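	/* YMM1-YMM4 now flag positions holding either CHAR or the null
	   byte in each of the four vectors.  */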

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6

	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

	jmp	L(loop_4x_vec)

	.p2align 4
L(first_vec_x0):
	/* Found CHAR or the null byte.  */
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	VEC_SIZE(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
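	/* At least one of the four vectors contains CHAR or the null
	   byte; find the first one that does.  */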
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
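	/* Fall through: if none of the first three vectors matched, the
	   match must be in the fourth.  */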
L(first_vec_x3):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

END (STRCHR)
#endif