/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
# define MEMCHR __memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
# define VPMINU vpminud
# define VPCMP vpcmpd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPMINU vpminub
# define VPCMP vpcmpb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# endif

/* In the 4x loop the RTM and non-RTM versions address their data at
   displacements that differ by VEC_SIZE * 4, the RTM version's being
   VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.  As
   well, because the RTM version uses vpcmp, which stores a bit per
   element compared, where the non-RTM version uses vpcmpeq (plus
   vpmovmskb), which stores a bit per byte compared, RET_SCALE of
   CHAR_SIZE is only relevant for the RTM version.  */
# ifdef USE_IN_RTM
# define VZEROUPPER
# define BASE_OFFSET (VEC_SIZE * 4)
# define RET_SCALE CHAR_SIZE
# else
# define VZEROUPPER vzeroupper
# define BASE_OFFSET 0
# define RET_SCALE 1
# endif

/* In the return path from the 4x loop the memchr and rawmemchr
   versions have data pointers that differ by VEC_SIZE * 4, since
   rawmemchr advances its pointer inside the loop; RET_OFFSET
   compensates for the difference.  */
# ifdef USE_AS_RAWMEMCHR
# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
# define RAW_PTR_REG rcx
# define ALGN_PTR_REG rdi
# else
# define RET_OFFSET BASE_OFFSET
# define RAW_PTR_REG rdi
# define ALGN_PTR_REG rcx
# endif

# define XMMZERO xmm23
# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22

# ifndef SECTION
# define SECTION(p) p##.evex
# endif

# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test %RDX_LP, %RDX_LP
	jz L(zero)

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
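	/* NB: writing a 32-bit register zero-extends into the full
	   64-bit register, so %rdx is safe to use in the 64-bit length
	   checks below even when the x32 caller only set %edx.  */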
# endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross a page boundary with one vector load.  */
	movl %edi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)
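	/* Fall through: the unaligned VEC_SIZE load below stays within
	   the current page, so it cannot fault even if it reads past
	   the end of the buffer.  */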

	/* Check the first VEC_SIZE bytes.  */
	VPCMP $0, (%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq $CHAR_PER_VEC, %rdx
	jbe L(first_vec_x0)
# endif
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Scale the char index by CHAR_SIZE to get the byte
	   offset.  */
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl %eax, %eax
	ret

	.p2align 5
L(first_vec_x0):
	/* Check if the first match is within the length.  */
	tzcntl %eax, %eax
	xorl %ecx, %ecx
	cmpl %eax, %edx
	leaq (%rdi, %rax, CHAR_SIZE), %rax
	cmovle %rcx, %rax
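	/* %edx <= %eax means the match index is outside the length
	   (tzcnt of an all-zero mask gives 32, which is also >= the
	   length here), so the cmovle above replaces the result with
	   NULL from %rcx.  */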
	ret
# else
	/* NB: first_vec_x0 is 17 bytes, which leaves
	   cross_page_boundary (which is relatively cold) close enough
	   to ideal alignment.  So only realign L(cross_page_boundary)
	   for rawmemchr.  */
	.p2align 4
# endif
L(cross_page_boundary):
	/* Save the pointer before aligning: the original value is
	   needed to compute the return address if a match is found,
	   or to adjust the length if it is not (memchr only).  */
	movq %rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and
	   rdi for rawmemchr.  */
	andq $-VEC_SIZE, %ALGN_PTR_REG
	VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide the shift count by 4 since each bit in K0
	   represents 4 bytes.  */
	sarl $2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl $(PAGE_SIZE / CHAR_SIZE), %esi
	subl %eax, %esi
# endif
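	/* For memchr, %esi now holds the number of chars from the
	   original pointer to the end of the page, which on this path
	   is also the end of the aligned vector just compared; it is
	   used below to check whether the length ends before that
	   point.  */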
# ifdef USE_AS_WMEMCHR
	andl $(CHAR_PER_VEC - 1), %eax
# endif
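	/* The mask above is only needed for wmemchr: sarx takes its
	   shift count modulo 32, which for byte-sized chars already
	   equals the offset within the 32-byte vector, but for wchar_t
	   chars the count must be reduced modulo CHAR_PER_VEC
	   explicitly.  */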
	/* Remove the leading bytes.  */
	sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq %rsi, %rdx
	jbe L(first_vec_x0)
# endif
	testl %eax, %eax
	jz L(cross_page_continue)
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Scale the char index by CHAR_SIZE to get the byte
	   offset.  */
	leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq %RAW_PTR_REG, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
	leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl %ecx, %ecx
	subl %edi, %ecx
	andq $-VEC_SIZE, %rdi
	/* esi is used to adjust the length to check whether we are
	   near the end.  */
	leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
# ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl $2, %esi
# endif
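	/* %esi now equals the number of chars from the original
	   pointer up to the end of the vector at (VEC_SIZE * 4)(%rdi);
	   subtracting it from the length below tells whether the four
	   vectors checked here already cover the rest of the range.  */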
# else
	andq $-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load the first VEC regardless.  */
	VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust the length.  If near the end handle specially.  */
	subq %rsi, %rdx
	jbe L(last_4x_vec_or_less)
# endif
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at most CHAR_PER_VEC * 4 chars of the length
	   remain.  */
	subq $(CHAR_PER_VEC * 4), %rdx
	jbe L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM, otherwise +VEC_SIZE * 5.  */
	addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust the
	   length.  */
# ifdef USE_AS_WMEMCHR
	movl %edi, %ecx
	andq $-(4 * VEC_SIZE), %rdi
	subl %edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl $2, %ecx
	addq %rcx, %rdx
# else
	addq %rdi, %rdx
	andq $-(4 * VEC_SIZE), %rdi
	subq %rdi, %rdx
# endif
# else
	addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq $-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq %XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use the VEX-encoded vpcmpeq,
	   which cannot encode the EVEX-only registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop: one that does not require
	   vzeroupper by not using ymm0-ymm15, and one that does
	   require vzeroupper because it uses ymm0-ymm15.  The reason
	   ymm0-ymm15 are used at all is that there is no EVEX
	   encoding of vpcmpeq with a vector destination, and with
	   vpcmpeq this loop can be performed more efficiently.  The
	   non-vzeroupper version is safe for RTM while the vzeroupper
	   version should be preferred if RTM is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x
	   VPCMP but the bottleneck on port 5 makes it not worth it.  */
	VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* The xor sets chars that match YMMMATCH to zero.  */
	vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP $0, %YMM3, %YMMZERO, %k2
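	/* k2 is now set at chars that matched in VEC1, VEC2 or VEC3:
	   matches in VEC2/VEC3 show up as zero elements of the min
	   (the xor zeroed matching chars), and matches in VEC1 are
	   folded in through the {z} mask since k1 holds VEC1's
	   non-matches.  k3 holds matches in VEC4, so the kortestd
	   below covers all four vectors.  */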
# else
	/* Since vpternlog can only combine 3 vectors, it is fastest to
	   do 1 vector separately with EVEX vpcmp.  */
# ifdef USE_AS_WMEMCHR
	/* vpternlog only accepts a write-mask at epi32/epi64
	   granularity, so folding VEC1 in via a not-equals mask only
	   saves an instruction for wmemchr.  */
	VPCMP $4, (%rdi), %YMMMATCH, %k1
# else
	VPCMP $0, (%rdi), %YMMMATCH, %k1
# endif
	/* Compare 3x with vpcmpeq and or them all together with
	   vpternlog.  */
	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
# ifdef USE_AS_WMEMCHR
	/* This takes the NOT of the OR of ymm2, ymm3 and ymm4 and
	   also folds in the result for VEC1 through the zero mask
	   (k1 holds VEC1's non-matches).  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
# else
	/* 254 (0xfe) is the truth table for OR'ing ymm2, ymm3 and
	   ymm4 into ymm4.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd %k1, %eax
# endif
# endif

# ifdef USE_AS_RAWMEMCHR
	subq $-(VEC_SIZE * 4), %rdi
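	/* NB: subq with a negative immediate rather than addq:
	   -(VEC_SIZE * 4) = -128 fits in a sign-extended 8-bit
	   immediate while +128 would need a 32-bit one, so this
	   encoding is shorter.  The same idiom is used for the other
	   VEC_SIZE * 4 adjustments below.  */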
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
# ifdef USE_AS_WMEMCHR
	/* ecx contains the NOT of the matches.  All 1s means no
	   matches.  incl will overflow and set the zero flag in that
	   case.  */
	incl %ecx
# else
	/* Check whether either VEC1 (eax) or VEC2-VEC4 (ecx) is
	   non-zero.  Adding eax into rcx is not an issue: if eax is
	   non-zero it will be used for returning the match, and if it
	   is zero the add changes nothing.  */
	addq %rax, %rcx
# endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz L(loop_4x_vec)
# else
	jnz L(loop_4x_vec_end)

	subq $-(VEC_SIZE * 4), %rdi

	subq $(CHAR_PER_VEC * 4), %rdx
	ja L(loop_4x_vec)

	/* Fall through into the case where fewer than 4 vectors of
	   length remain.  */
	VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq $(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd %k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if the first VEC contained a match.  */
	testl %eax, %eax
	jnz L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl $(CHAR_PER_VEC * 2), %edx
	jg L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl $CHAR_PER_VEC, %edx
	jle L(zero_end)

	/* Check VEC2 and compare any match with the remaining length.  */
	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
	cmpl %eax, %edx
	jbe L(set_zero_end)
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret


	.p2align 4
L(first_vec_x1_check):
	tzcntl %eax, %eax
	/* Adjust the length.  */
	subl $-(CHAR_PER_VEC * 4), %edx
	/* Check if the match is within the remaining length.  */
	cmpl %eax, %edx
	jbe L(set_zero_end)
	/* NB: Scale the char index by CHAR_SIZE to get the byte
	   offset.  */
	leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(set_zero_end):
	xorl %eax, %eax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found
	   in the loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the NOT of the matches with VEC1.  */
	kmovd %k1, %eax
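	/* Turn the NOT-of-matches mask into something testable: incl
	   (32 valid bits) or subtracting the all-ones CHAR_PER_VEC-bit
	   mask (wmemchr) sets ZF exactly when every char mismatched,
	   and otherwise leaves the lowest set bit of eax at the first
	   match, so the tzcnt in L(last_vec_x1_return) still finds
	   it.  */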
# ifdef USE_AS_WMEMCHR
	subl $((1 << CHAR_PER_VEC) - 1), %eax
# else
	incl %eax
# endif
# else
	/* eax already has the matches for VEC1.  */
	testl %eax, %eax
# endif
	jnz L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP $0, %YMM2, %YMMZERO, %k0
	kmovd %k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl %eax, %eax
	jnz L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd %k2, %eax
	testl %eax, %eax
	jnz L(last_vec_x3_return)

	kmovd %k3, %eax
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx),
	   shifted up by VEC_SIZE so a single 64-bit tzcnt finds the
	   first match across both.  */
	salq $VEC_SIZE, %rcx
	orq %rcx, %rax
	tzcntq %rax, %rax
	leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4
L(last_vec_x1_return):
	tzcntl %eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Scale the char index by CHAR_SIZE to get the byte
	   offset.  */
	leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl %eax, %eax
	/* NB: Scale the index by RET_SCALE to get the byte offset if
	   relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined; otherwise RET_SCALE = 1).  */
	leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl %eax, %eax
	/* NB: Scale the char index by CHAR_SIZE to get the byte
	   offset.  */
	leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
	VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	subq $-(VEC_SIZE * 4), %rdi
	/* Check the first VEC regardless.  */
	testl %eax, %eax
	jnz L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl $(CHAR_PER_VEC * 2), %edx
	jle L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(last_vec_x2)


	VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	/* Create mask for possible matches within the remaining
	   length.  */
# ifdef USE_AS_WMEMCHR
	movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil %edx, %ecx, %ecx
# else
	movq $-1, %rcx
	bzhiq %rdx, %rcx, %rcx
# endif
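	/* bzhi clears all bits at positions >= the remaining length,
	   so ecx/rcx now has a set bit for every in-bounds char
	   position relative to (VEC_SIZE * 3)(%rdi).  The mask spans
	   two vectors' worth of positions: it is used as-is here and
	   shifted right by CHAR_PER_VEC below for the vector at
	   (VEC_SIZE * 4)(%rdi).  */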
	/* Keep only the matches that fall within the remaining
	   length.  */
	andl %ecx, %eax
	jnz L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
	   the remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl $CHAR_PER_VEC, %edx
	jbe L(zero_end2)


	VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd %k0, %eax
	/* Shift the remaining-length mask for the last VEC.  */
# ifdef USE_AS_WMEMCHR
	shrl $CHAR_PER_VEC, %ecx
# else
	shrq $CHAR_PER_VEC, %rcx
# endif
	andl %ecx, %eax
	jz L(zero_end2)
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl %eax, %eax
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

END (MEMCHR)
#endif