1 | /* memrchr optimized with AVX2. |
2 | Copyright (C) 2017-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (3) |
22 | |
23 | # include <sysdep.h> |
24 | |
25 | # ifndef MEMRCHR |
26 | # define MEMRCHR __memrchr_avx2 |
27 | # endif |
28 | |
29 | # ifndef VZEROUPPER |
30 | # define VZEROUPPER vzeroupper |
31 | # endif |
32 | |
33 | # ifndef SECTION |
34 | # define SECTION(p) p##.avx |
35 | # endif |
36 | |
37 | # define VEC_SIZE 32 |
38 | # define PAGE_SIZE 4096 |
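
	/* A rough C-level sketch of the strategy used below, illustrative
	   only: it ignores the page-cross and short-length special cases
	   and the 4x unrolled main loop.  VEC_LOAD, VPCMPEQB_MASK and
	   LZCNT are placeholders for the 32-byte load, the
	   byte-compare-to-bitmask and the leading-zero-count operations,
	   not real functions.  The highest match in a VEC is LZCNT (mask)
	   bytes below the last byte the VEC covers.

	       unsigned char *end = s + len - 1;
	       for (;;)
		 {
		   unsigned int mask
		     = VPCMPEQB_MASK (VEC_LOAD (end - (VEC_SIZE - 1)), c);
		   if (mask != 0)
		     {
		       unsigned char *hit = end - LZCNT (mask);
		       return hit >= s ? hit : NULL;
		     }
		   if (end - (VEC_SIZE - 1) <= s)
		     return NULL;
		   end -= VEC_SIZE;
		 }
	*/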
	.section SECTION(.text), "ax", @progbits
40 | ENTRY_P2ALIGN(MEMRCHR, 6) |
41 | # ifdef __ILP32__ |
42 | /* Clear upper bits. */ |
43 | and %RDX_LP, %RDX_LP |
44 | # else |
45 | test %RDX_LP, %RDX_LP |
46 | # endif |
47 | jz L(zero_0) |
48 | |
49 | vmovd %esi, %xmm0 |
	/* Get end pointer.  Minus one for two reasons.  1) It is necessary for
	   a correct page cross check and 2) it sets up the end pointer so that
	   subtracting the lzcnt of a match mask yields the match address
	   directly.  */
53 | leaq -1(%rdx, %rdi), %rax |
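	/* E.g. in the first compare below the last byte of the buffer sits in
	   byte 31 of the VEC; if it matches, lzcnt of the mask is 0 and
	   rax - 0 is already its address.  */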
54 | |
55 | vpbroadcastb %xmm0, %ymm0 |
56 | |
	/* Check if we can load 1x VEC without crossing a page.  */
58 | testl $(PAGE_SIZE - VEC_SIZE), %eax |
59 | jz L(page_cross) |
60 | |
61 | vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 |
62 | vpmovmskb %ymm1, %ecx |
63 | cmpq $VEC_SIZE, %rdx |
64 | ja L(more_1x_vec) |
65 | |
66 | L(ret_vec_x0_test): |
	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
	   which guarantees edx (len) is not greater than it.  */
69 | lzcntl %ecx, %ecx |
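	/* E.g. len = 20 with no match: ecx becomes 32, the cmpl/jle below
	   sees 20 <= 32 and returns NULL.  */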
70 | |
71 | /* Hoist vzeroupper (not great for RTM) to save code size. This allows |
72 | all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ |
73 | COND_VZEROUPPER |
74 | cmpl %ecx, %edx |
75 | jle L(zero_0) |
76 | subq %rcx, %rax |
77 | ret |
78 | |
	/* Fits in the alignment padding of the first cache line.  */
80 | L(zero_0): |
81 | xorl %eax, %eax |
82 | ret |
83 | |
84 | .p2align 4,, 9 |
85 | L(ret_vec_x0): |
86 | lzcntl %ecx, %ecx |
87 | subq %rcx, %rax |
88 | L(return_vzeroupper): |
89 | ZERO_UPPER_VEC_REGISTERS_RETURN |
90 | |
91 | .p2align 4,, 10 |
92 | L(more_1x_vec): |
93 | testl %ecx, %ecx |
94 | jnz L(ret_vec_x0) |
95 | |
	/* Align rax (pointer to the last byte of the string) down to a
	   VEC_SIZE boundary.  */
97 | andq $-VEC_SIZE, %rax |
98 | |
99 | /* Recompute remaining length after aligning. */ |
100 | movq %rax, %rdx |
101 | /* Need this comparison next no matter what. */ |
102 | vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 |
103 | subq %rdi, %rdx |
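	/* rax was rounded down to a VEC_SIZE boundary above; decrement it so
	   it points at the last byte covered by the compare and subtracting
	   lzcnt again yields the match address.  */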
104 | decq %rax |
105 | vpmovmskb %ymm1, %ecx |
	/* Fall through for the short case (short lengths are hotter than
	   long ones).  */
107 | cmpq $(VEC_SIZE * 2), %rdx |
108 | ja L(more_2x_vec) |
109 | L(last_2x_vec): |
110 | cmpl $VEC_SIZE, %edx |
111 | jbe L(ret_vec_x0_test) |
112 | |
113 | testl %ecx, %ecx |
114 | jnz L(ret_vec_x0) |
115 | |
116 | vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 |
117 | vpmovmskb %ymm1, %ecx |
	/* 64-bit lzcnt.  The upper 32 bits of rcx are zero, so this naturally
	   adds 32 (VEC_SIZE) to the position.  */
119 | lzcntq %rcx, %rcx |
120 | COND_VZEROUPPER |
121 | cmpl %ecx, %edx |
122 | jle L(zero_0) |
123 | subq %rcx, %rax |
124 | ret |
125 | |
126 | |
	/* Inexpensive place to put this with regard to code size / target
	   alignment / ICache NLP.  Keeping it here allows a 2-byte encoding
	   of the jump to the page cross case, which in turn is necessary for
	   the hot path (len <= VEC_SIZE) to fit in the first cache line.  */
131 | L(page_cross): |
132 | movq %rax, %rsi |
133 | andq $-VEC_SIZE, %rsi |
134 | vpcmpeqb (%rsi), %ymm0, %ymm1 |
135 | vpmovmskb %ymm1, %ecx |
136 | /* Shift out negative alignment (because we are starting from endptr and |
137 | working backwards). */ |
138 | movl %eax, %r8d |
139 | /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ |
140 | notl %r8d |
141 | shlxl %r8d, %ecx, %ecx |
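	/* shlx masks the shift count to 5 bits, so the count is
	   (~eax) % 32 = 31 - (rax % VEC_SIZE): the bit for the byte at rax
	   moves to bit 31 and bits for bytes past the end of the buffer are
	   shifted out, so lzcnt of the result is the distance of the last
	   match from rax.  */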
142 | cmpq %rdi, %rsi |
143 | ja L(more_1x_vec) |
144 | lzcntl %ecx, %ecx |
145 | COND_VZEROUPPER |
146 | cmpl %ecx, %edx |
147 | jle L(zero_0) |
148 | subq %rcx, %rax |
149 | ret |
150 | .p2align 4,, 11 |
151 | L(ret_vec_x1): |
	/* 64-bit lzcnt.  This will naturally add 32 to the position.  */
153 | lzcntq %rcx, %rcx |
154 | subq %rcx, %rax |
155 | VZEROUPPER_RETURN |
156 | .p2align 4,, 10 |
157 | L(more_2x_vec): |
158 | testl %ecx, %ecx |
159 | jnz L(ret_vec_x0) |
160 | |
161 | vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 |
162 | vpmovmskb %ymm1, %ecx |
163 | testl %ecx, %ecx |
164 | jnz L(ret_vec_x1) |
165 | |
166 | |
167 | /* Needed no matter what. */ |
168 | vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 |
169 | vpmovmskb %ymm1, %ecx |
170 | |
171 | subq $(VEC_SIZE * 4), %rdx |
172 | ja L(more_4x_vec) |
173 | |
	/* The remaining length was in (2 * VEC_SIZE, 4 * VEC_SIZE], so rdx is
	   now in (-2 * VEC_SIZE, 0].  If it was <= 3 * VEC_SIZE the VEC
	   compared above may extend below the start of the buffer, so use
	   the bounds-checked return.  */
	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
176 | |
177 | L(last_vec): |
178 | testl %ecx, %ecx |
179 | jnz L(ret_vec_x2) |
180 | |
181 | /* Needed no matter what. */ |
182 | vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 |
183 | vpmovmskb %ymm1, %ecx |
184 | lzcntl %ecx, %ecx |
185 | subq $(VEC_SIZE * 3), %rax |
186 | COND_VZEROUPPER |
187 | subq %rcx, %rax |
188 | cmpq %rax, %rdi |
189 | ja L(zero_2) |
190 | ret |
191 | |
	/* Fits in the alignment padding bytes.  */
193 | L(zero_2): |
194 | xorl %eax, %eax |
195 | ret |
196 | |
197 | .p2align 4,, 4 |
198 | L(ret_vec_x2_test): |
199 | lzcntl %ecx, %ecx |
200 | subq $(VEC_SIZE * 2), %rax |
201 | COND_VZEROUPPER |
202 | subq %rcx, %rax |
203 | cmpq %rax, %rdi |
204 | ja L(zero_2) |
205 | ret |
206 | |
207 | |
208 | .p2align 4,, 11 |
209 | L(ret_vec_x2): |
210 | /* ecx must be non-zero. */ |
211 | bsrl %ecx, %ecx |
212 | leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax |
213 | VZEROUPPER_RETURN |
214 | |
215 | .p2align 4,, 14 |
216 | L(ret_vec_x3): |
217 | /* ecx must be non-zero. */ |
218 | bsrl %ecx, %ecx |
219 | leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax |
220 | VZEROUPPER_RETURN |
221 | |
222 | |
223 | |
224 | .p2align 4 |
225 | L(more_4x_vec): |
226 | testl %ecx, %ecx |
227 | jnz L(ret_vec_x2) |
228 | |
229 | vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 |
230 | vpmovmskb %ymm1, %ecx |
231 | |
232 | testl %ecx, %ecx |
233 | jnz L(ret_vec_x3) |
234 | |
235 | /* Check if near end before re-aligning (otherwise might do an |
236 | unnecessary loop iteration). */ |
237 | addq $-(VEC_SIZE * 4), %rax |
238 | cmpq $(VEC_SIZE * 4), %rdx |
239 | jbe L(last_4x_vec) |
240 | |
	/* Align rax: set its low bits so it sits (VEC_SIZE * 4 - 1) above a
	   (VEC_SIZE * 4) boundary; the loop's -(VEC_SIZE * N - 1)(%rax)
	   loads are then VEC_SIZE aligned.  */
242 | orq $(VEC_SIZE * 4 - 1), %rax |
243 | movq %rdi, %rdx |
244 | /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because |
245 | lengths that overflow can be valid and break the comparison. */ |
246 | orq $(VEC_SIZE * 4 - 1), %rdx |
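	/* rax and rdx now both have their low bits set mod (VEC_SIZE * 4),
	   so the loop below can step rax down by VEC_SIZE * 4 and terminate
	   on exact equality once it reaches the block containing rdi.  */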
247 | |
248 | .p2align 4 |
249 | L(loop_4x_vec): |
	/* The 4 compares below are needed on every iteration.  */
251 | vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 |
252 | vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 |
253 | vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 |
254 | vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 |
255 | |
256 | vpor %ymm1, %ymm2, %ymm2 |
257 | vpor %ymm3, %ymm4, %ymm4 |
258 | vpor %ymm2, %ymm4, %ymm4 |
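	/* ymm4 now holds the OR of all 4 compare results: any set byte means
	   CHAR was found somewhere in the 4x VEC range.  */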
259 | vpmovmskb %ymm4, %esi |
260 | |
261 | testl %esi, %esi |
262 | jnz L(loop_end) |
263 | |
264 | addq $(VEC_SIZE * -4), %rax |
265 | cmpq %rdx, %rax |
266 | jne L(loop_4x_vec) |
267 | |
	/* rax == rdx: compute the remaining length from rdi up to and
	   including the end of this 4x VEC block.  */
	subl	%edi, %edx
	incl	%edx
270 | |
271 | L(last_4x_vec): |
272 | /* Used no matter what. */ |
273 | vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 |
274 | vpmovmskb %ymm1, %ecx |
275 | |
276 | cmpl $(VEC_SIZE * 2), %edx |
277 | jbe L(last_2x_vec) |
278 | |
279 | testl %ecx, %ecx |
280 | jnz L(ret_vec_x0_end) |
281 | |
282 | vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 |
283 | vpmovmskb %ymm1, %ecx |
284 | testl %ecx, %ecx |
285 | jnz L(ret_vec_x1_end) |
286 | |
287 | /* Used no matter what. */ |
288 | vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 |
289 | vpmovmskb %ymm1, %ecx |
290 | |
291 | cmpl $(VEC_SIZE * 3), %edx |
292 | ja L(last_vec) |
293 | |
294 | lzcntl %ecx, %ecx |
295 | subq $(VEC_SIZE * 2), %rax |
296 | COND_VZEROUPPER |
297 | subq %rcx, %rax |
298 | cmpq %rax, %rdi |
299 | jbe L(ret0) |
300 | xorl %eax, %eax |
301 | L(ret0): |
302 | ret |
303 | |
304 | |
305 | .p2align 4 |
306 | L(loop_end): |
307 | vpmovmskb %ymm1, %ecx |
308 | testl %ecx, %ecx |
309 | jnz L(ret_vec_x0_end) |
310 | |
311 | vpmovmskb %ymm2, %ecx |
312 | testl %ecx, %ecx |
313 | jnz L(ret_vec_x1_end) |
314 | |
315 | vpmovmskb %ymm3, %ecx |
	/* Combine the last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in
	   VEC3) it won't affect the result in esi (VEC4).  If ecx is non-zero
	   then CHAR is in VEC3 and bsrq will use that (higher) position.  The
	   match is known to be in VEC3 or VEC4 here, so the combined mask is
	   non-zero and bsrq is well defined.  */
319 | salq $32, %rcx |
320 | orq %rsi, %rcx |
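	/* bsrq returns the highest set bit p: p >= 32 selects a byte in VEC3
	   (offset p - 32 from its load base at rax - (VEC_SIZE * 3 - 1)),
	   p < 32 selects a byte in VEC4 (offset p from
	   rax - (VEC_SIZE * 4 - 1)).  Both resolve to
	   rax - (VEC_SIZE * 4 - 1) + p, hence the single leaq below.  */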
321 | bsrq %rcx, %rcx |
322 | leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax |
323 | VZEROUPPER_RETURN |
324 | |
325 | .p2align 4,, 4 |
326 | L(ret_vec_x1_end): |
327 | /* 64-bit version will automatically add 32 (VEC_SIZE). */ |
328 | lzcntq %rcx, %rcx |
329 | subq %rcx, %rax |
330 | VZEROUPPER_RETURN |
331 | |
332 | .p2align 4,, 4 |
333 | L(ret_vec_x0_end): |
334 | lzcntl %ecx, %ecx |
335 | subq %rcx, %rax |
336 | VZEROUPPER_RETURN |
337 | |
338 | /* 2 bytes until next cache line. */ |
339 | END(MEMRCHR) |
340 | #endif |
341 | |