1 | /* memchr/wmemchr optimized with 256-bit EVEX instructions. |
2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | #include <sysdep.h> |
21 | |
22 | #if ISA_SHOULD_BUILD (4) |
23 | |
24 | # ifndef MEMCHR |
25 | # define MEMCHR __memchr_evex |
26 | # endif |
27 | |
28 | # ifdef USE_AS_WMEMCHR |
29 | # define VPBROADCAST vpbroadcastd |
30 | # define VPMINU vpminud |
31 | # define VPCMP vpcmpd |
32 | # define VPCMPEQ vpcmpeqd |
33 | # define CHAR_SIZE 4 |
34 | # else |
35 | # define VPBROADCAST vpbroadcastb |
36 | # define VPMINU vpminub |
37 | # define VPCMP vpcmpb |
38 | # define VPCMPEQ vpcmpeqb |
39 | # define CHAR_SIZE 1 |
40 | # endif |
41 | |
/* In the 4x loop the RTM and non-RTM versions have data pointers
   that are off by VEC_SIZE * 4, with the RTM version being
   VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.
   Additionally, because the RTM version uses vpcmp, which stores a
   bit per element compared, whereas the non-RTM version uses
   vpcmpeq, which stores a bit per byte compared, a RET_SCALE of
   CHAR_SIZE is only relevant for the RTM version.  */
49 | # ifdef USE_IN_RTM |
50 | # define VZEROUPPER |
51 | # define BASE_OFFSET (VEC_SIZE * 4) |
52 | # define RET_SCALE CHAR_SIZE |
53 | # else |
54 | # define VZEROUPPER vzeroupper |
55 | # define BASE_OFFSET 0 |
56 | # define RET_SCALE 1 |
57 | # endif |
58 | |
/* In the return path from the 4x loop the memchr and rawmemchr
   versions have data pointers that are off by VEC_SIZE * 4, with
   the memchr version being VEC_SIZE * 4 greater.  */
62 | # ifdef USE_AS_RAWMEMCHR |
63 | # define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) |
64 | # define RAW_PTR_REG rcx |
65 | # define ALGN_PTR_REG rdi |
66 | # else |
67 | # define RET_OFFSET BASE_OFFSET |
68 | # define RAW_PTR_REG rdi |
69 | # define ALGN_PTR_REG rcx |
70 | # endif |
71 | |
72 | # define XMMZERO xmm23 |
73 | # define YMMZERO ymm23 |
74 | # define XMMMATCH xmm16 |
75 | # define YMMMATCH ymm16 |
76 | # define YMM1 ymm17 |
77 | # define YMM2 ymm18 |
78 | # define YMM3 ymm19 |
79 | # define YMM4 ymm20 |
80 | # define YMM5 ymm21 |
81 | # define YMM6 ymm22 |
82 | |
83 | # ifndef SECTION |
84 | # define SECTION(p) p##.evex |
85 | # endif |
86 | |
87 | # define VEC_SIZE 32 |
88 | # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
89 | # define PAGE_SIZE 4096 |
90 | |
	.section SECTION(.text),"ax",@progbits
92 | ENTRY_P2ALIGN (MEMCHR, 6) |
93 | # ifndef USE_AS_RAWMEMCHR |
94 | /* Check for zero length. */ |
95 | test %RDX_LP, %RDX_LP |
96 | jz L(zero) |
97 | |
98 | # ifdef __ILP32__ |
99 | /* Clear the upper 32 bits. */ |
100 | movl %edx, %edx |
101 | # endif |
102 | # endif |
103 | /* Broadcast CHAR to YMMMATCH. */ |
104 | VPBROADCAST %esi, %YMMMATCH |
105 | /* Check if we may cross page boundary with one vector load. */ |
106 | movl %edi, %eax |
107 | andl $(PAGE_SIZE - 1), %eax |
108 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
109 | ja L(cross_page_boundary) |
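	/* The check above is roughly:
	       if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
	           goto cross_page_boundary;
	   i.e. take the slow path only when an unaligned VEC_SIZE load
	   starting at s could read from the next page.  */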
110 | |
111 | /* Check the first VEC_SIZE bytes. */ |
112 | VPCMP $0, (%rdi), %YMMMATCH, %k0 |
113 | kmovd %k0, %eax |
114 | # ifndef USE_AS_RAWMEMCHR |
	/* If length <= CHAR_PER_VEC handle specially.  */
116 | cmpq $CHAR_PER_VEC, %rdx |
117 | jbe L(first_vec_x0) |
118 | # endif |
119 | testl %eax, %eax |
120 | jz L(aligned_more) |
121 | tzcntl %eax, %eax |
122 | # ifdef USE_AS_WMEMCHR |
	/* NB: Multiply the wchar_t index by CHAR_SIZE to get the byte
	   offset.  */
124 | leaq (%rdi, %rax, CHAR_SIZE), %rax |
125 | # else |
126 | addq %rdi, %rax |
127 | # endif |
128 | ret |
129 | |
130 | # ifndef USE_AS_RAWMEMCHR |
131 | L(zero): |
132 | xorl %eax, %eax |
133 | ret |
134 | |
135 | .p2align 4 |
136 | L(first_vec_x0): |
	/* Check if the first match was before the length.  NB: tzcnt has a
	   false data-dependency on its destination; eax already had a
	   data-dependency on esi so this should have no effect here.  */
140 | tzcntl %eax, %esi |
141 | # ifdef USE_AS_WMEMCHR |
142 | leaq (%rdi, %rsi, CHAR_SIZE), %rdi |
143 | # else |
144 | addq %rsi, %rdi |
145 | # endif |
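	/* The branchless select below is effectively:
	       return (match index < length) ? pointer to match : NULL,
	   with cmovg avoiding a branch.  */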
146 | xorl %eax, %eax |
147 | cmpl %esi, %edx |
148 | cmovg %rdi, %rax |
149 | ret |
150 | # endif |
151 | |
152 | .p2align 4 |
153 | L(cross_page_boundary): |
	/* Save the pointer before aligning, as its original value is
	   needed for computing the return address if a byte is found, or
	   for adjusting the length if it is not and this is memchr.  */
157 | movq %rdi, %rcx |
158 | /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi |
159 | for rawmemchr. */ |
160 | andq $-VEC_SIZE, %ALGN_PTR_REG |
161 | VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 |
162 | kmovd %k0, %r8d |
163 | # ifdef USE_AS_WMEMCHR |
	/* NB: Divide the shift count by 4 since each bit in K0 represents
	   4 bytes.  */
166 | sarl $2, %eax |
167 | # endif |
168 | # ifndef USE_AS_RAWMEMCHR |
169 | movl $(PAGE_SIZE / CHAR_SIZE), %esi |
170 | subl %eax, %esi |
171 | # endif |
172 | # ifdef USE_AS_WMEMCHR |
173 | andl $(CHAR_PER_VEC - 1), %eax |
174 | # endif |
175 | /* Remove the leading bytes. */ |
176 | sarxl %eax, %r8d, %eax |
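	/* eax now holds the compare mask with the bits for the characters
	   before the original pointer shifted out, so bit 0 corresponds
	   to the first requested character.  */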
177 | # ifndef USE_AS_RAWMEMCHR |
178 | /* Check the end of data. */ |
179 | cmpq %rsi, %rdx |
180 | jbe L(first_vec_x0) |
181 | # endif |
182 | testl %eax, %eax |
183 | jz L(cross_page_continue) |
184 | tzcntl %eax, %eax |
185 | # ifdef USE_AS_WMEMCHR |
	/* NB: Multiply the wchar_t index by CHAR_SIZE to get the byte
	   offset.  */
187 | leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax |
188 | # else |
189 | addq %RAW_PTR_REG, %rax |
190 | # endif |
191 | ret |
192 | |
193 | .p2align 4 |
194 | L(first_vec_x1): |
195 | tzcntl %eax, %eax |
196 | leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax |
197 | ret |
198 | |
199 | .p2align 4 |
200 | L(first_vec_x2): |
201 | tzcntl %eax, %eax |
202 | leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
203 | ret |
204 | |
205 | .p2align 4 |
206 | L(first_vec_x3): |
207 | tzcntl %eax, %eax |
208 | leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
209 | ret |
210 | |
211 | .p2align 4 |
212 | L(first_vec_x4): |
213 | tzcntl %eax, %eax |
214 | leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax |
215 | ret |
216 | |
217 | .p2align 5 |
218 | L(aligned_more): |
219 | /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time |
220 | since data is only aligned to VEC_SIZE. */ |
221 | |
222 | # ifndef USE_AS_RAWMEMCHR |
223 | /* Align data to VEC_SIZE. */ |
224 | L(cross_page_continue): |
225 | xorl %ecx, %ecx |
226 | subl %edi, %ecx |
227 | andq $-VEC_SIZE, %rdi |
	/* esi is used to adjust the length to see if we are near the end.  */
229 | leal (VEC_SIZE * 5)(%rdi, %rcx), %esi |
230 | # ifdef USE_AS_WMEMCHR |
231 | /* NB: Divide bytes by 4 to get the wchar_t count. */ |
232 | sarl $2, %esi |
233 | # endif |
234 | # else |
235 | andq $-VEC_SIZE, %rdi |
236 | L(cross_page_continue): |
237 | # endif |
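	/* rdi is now aligned down to VEC_SIZE.  For memchr, esi holds the
	   number of characters from the original pointer to the end of
	   the four vectors checked below, so the subtraction from rdx
	   below detects whether the buffer ends within them.  */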
238 | /* Load first VEC regardless. */ |
239 | VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 |
240 | kmovd %k0, %eax |
241 | # ifndef USE_AS_RAWMEMCHR |
242 | /* Adjust length. If near end handle specially. */ |
243 | subq %rsi, %rdx |
244 | jbe L(last_4x_vec_or_less) |
245 | # endif |
246 | testl %eax, %eax |
247 | jnz L(first_vec_x1) |
248 | |
249 | VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
250 | kmovd %k0, %eax |
251 | testl %eax, %eax |
252 | jnz L(first_vec_x2) |
253 | |
254 | VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 |
255 | kmovd %k0, %eax |
256 | testl %eax, %eax |
257 | jnz L(first_vec_x3) |
258 | |
259 | VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 |
260 | kmovd %k0, %eax |
261 | testl %eax, %eax |
262 | jnz L(first_vec_x4) |
263 | |
264 | |
265 | # ifndef USE_AS_RAWMEMCHR |
	/* Check if only CHAR_PER_VEC * 4 or fewer characters remain.  */
267 | subq $(CHAR_PER_VEC * 4), %rdx |
268 | jbe L(last_4x_vec_or_less_cmpeq) |
269 | /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ |
270 | addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi |
271 | |
272 | /* Align data to VEC_SIZE * 4 for the loop and readjust length. |
273 | */ |
274 | # ifdef USE_AS_WMEMCHR |
275 | movl %edi, %ecx |
276 | andq $-(4 * VEC_SIZE), %rdi |
277 | subl %edi, %ecx |
278 | /* NB: Divide bytes by 4 to get the wchar_t count. */ |
279 | sarl $2, %ecx |
280 | addq %rcx, %rdx |
281 | # else |
282 | addq %rdi, %rdx |
283 | andq $-(4 * VEC_SIZE), %rdi |
284 | subq %rdi, %rdx |
285 | # endif |
286 | # else |
287 | addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi |
288 | andq $-(4 * VEC_SIZE), %rdi |
289 | # endif |
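	/* rdi is now aligned down to 4 * VEC_SIZE.  For memchr, rdx has
	   been increased by the number of characters the pointer moved
	   back, so the remaining-length accounting stays consistent.  */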
290 | # ifdef USE_IN_RTM |
291 | vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
292 | # else |
	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which is not
	   encodable with EVEX registers (ymm16-ymm31).  */
295 | vmovdqa64 %YMMMATCH, %ymm0 |
296 | # endif |
297 | |
298 | /* Compare 4 * VEC at a time forward. */ |
299 | .p2align 4 |
300 | L(loop_4x_vec): |
	/* Two versions of the loop: one that does not require vzeroupper
	   by not using ymm0-ymm15, and one that does require vzeroupper
	   because it uses ymm0-ymm15.  The only reason ymm0-ymm15 is used
	   at all is that there is no EVEX encoding of vpcmpeq, and with
	   vpcmpeq this loop can be performed more efficiently.  The
	   non-vzeroupper version is safe for RTM, while the vzeroupper
	   version should be preferred if RTM is not supported.  */
308 | # ifdef USE_IN_RTM |
	/* It would be possible to save some instructions using 4x VPCMP,
	   but the bottleneck on port 5 makes it not worth it.  */
311 | VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 |
	/* The xor will set elements that match CHAR to zero.  */
313 | vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 |
314 | vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 |
315 | VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 |
316 | /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ |
317 | VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} |
318 | VPCMP $0, %YMM3, %YMMZERO, %k2 |
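	/* Note on the reduction above: YMM2 and YMM3 hold data ^ CHAR, so
	   a zero element marks a match.  The zero-masked VPMINU also
	   folds in the VEC1 not-equal mask k1, so a lane of YMM3 is zero
	   iff VEC1, VEC2 or VEC3 matched there.  Together with k3 (VEC4
	   matches) the kortestd check below covers all four vectors.  */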
319 | # else |
	/* Since vptern can only take 3 vectors, it is fastest to do 1
	   vector separately with EVEX vpcmp.  */
322 | # ifdef USE_AS_WMEMCHR |
	/* vptern can only accept masks for epi32/epi64, so folding the
	   first compare in via a not-equals mask is only possible for
	   wmemchr.  */
325 | VPCMP $4, (%rdi), %YMMMATCH, %k1 |
326 | # else |
327 | VPCMP $0, (%rdi), %YMMMATCH, %k1 |
328 | # endif |
	/* Compare 3 vectors with vpcmpeq and OR them all together with
	   vptern.  */
331 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 |
332 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 |
333 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 |
334 | # ifdef USE_AS_WMEMCHR |
	/* This takes the NOT of the OR of ymm2, ymm3 and ymm4, and also
	   combines the result from VEC0 via the zero mask in k1.  */
337 | vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} |
338 | vpmovmskb %ymm4, %ecx |
339 | # else |
	/* 254 is the truth-table immediate for ORing ymm2, ymm3 and ymm4
	   into ymm4.  */
341 | vpternlogd $254, %ymm2, %ymm3, %ymm4 |
342 | vpmovmskb %ymm4, %ecx |
343 | kmovd %k1, %eax |
344 | # endif |
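	/* For reference: the vpternlogd immediate is a 3-input truth
	   table; $1 computes NOT (a | b | c) while $254 computes
	   a | b | c.  */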
345 | # endif |
346 | |
347 | # ifdef USE_AS_RAWMEMCHR |
348 | subq $-(VEC_SIZE * 4), %rdi |
349 | # endif |
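	/* NB: Here and below, subq of a negative immediate is used
	   instead of addq where the constant is VEC_SIZE * 4 = 128,
	   presumably because -128 fits in a sign-extended 8-bit immediate
	   while +128 does not.  */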
350 | # ifdef USE_IN_RTM |
351 | kortestd %k2, %k3 |
352 | # else |
353 | # ifdef USE_AS_WMEMCHR |
	/* ecx contains the NOT of the matches.  All 1s means no matches;
	   incl will overflow and set the zero flag in that case.  */
356 | incl %ecx |
357 | # else |
	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
	   Adding eax to rcx is not an issue because if eax is non-zero it
	   will be used for returning the match, and if it is zero the add
	   changes nothing.  */
362 | addq %rax, %rcx |
363 | # endif |
364 | # endif |
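	/* Whichever variant was used above, ZF is now set iff none of the
	   four vectors contained CHAR.  */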
365 | # ifdef USE_AS_RAWMEMCHR |
366 | jz L(loop_4x_vec) |
367 | # else |
368 | jnz L(loop_4x_vec_end) |
369 | |
370 | subq $-(VEC_SIZE * 4), %rdi |
371 | |
372 | subq $(CHAR_PER_VEC * 4), %rdx |
373 | ja L(loop_4x_vec) |
374 | |
	/* Fall through into the case where fewer than 4 vectors worth of
	   length remain.  */
377 | VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 |
378 | addq $(BASE_OFFSET - VEC_SIZE), %rdi |
379 | kmovd %k0, %eax |
380 | VZEROUPPER |
381 | |
382 | L(last_4x_vec_or_less): |
383 | /* Check if first VEC contained match. */ |
384 | testl %eax, %eax |
385 | jnz L(first_vec_x1_check) |
386 | |
387 | /* If remaining length > CHAR_PER_VEC * 2. */ |
388 | addl $(CHAR_PER_VEC * 2), %edx |
389 | jg L(last_4x_vec) |
390 | |
391 | L(last_2x_vec): |
392 | /* If remaining length < CHAR_PER_VEC. */ |
393 | addl $CHAR_PER_VEC, %edx |
394 | jle L(zero_end) |
395 | |
396 | /* Check VEC2 and compare any match with remaining length. */ |
397 | VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
398 | kmovd %k0, %eax |
399 | tzcntl %eax, %eax |
400 | cmpl %eax, %edx |
401 | jbe L(set_zero_end) |
402 | leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
403 | L(zero_end): |
404 | ret |
405 | |
406 | L(set_zero_end): |
407 | xorl %eax, %eax |
408 | ret |
409 | |
410 | .p2align 4 |
411 | L(first_vec_x1_check): |
412 | /* eax must be non-zero. Use bsfl to save code size. */ |
413 | bsfl %eax, %eax |
414 | /* Adjust length. */ |
415 | subl $-(CHAR_PER_VEC * 4), %edx |
416 | /* Check if match within remaining length. */ |
417 | cmpl %eax, %edx |
418 | jbe L(set_zero_end) |
	/* NB: Multiply the character index by CHAR_SIZE to get the byte
	   offset.  */
420 | leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax |
421 | ret |
422 | |
423 | .p2align 4 |
424 | L(loop_4x_vec_end): |
425 | # endif |
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */
428 | |
429 | # if defined USE_IN_RTM || defined USE_AS_WMEMCHR |
	/* k1 has the NOT of the matches with VEC1.  */
431 | kmovd %k1, %eax |
432 | # ifdef USE_AS_WMEMCHR |
433 | subl $((1 << CHAR_PER_VEC) - 1), %eax |
434 | # else |
435 | incl %eax |
436 | # endif |
437 | # else |
438 | /* eax already has matches for VEC1. */ |
439 | testl %eax, %eax |
440 | # endif |
441 | jnz L(last_vec_x1_return) |
442 | |
443 | # ifdef USE_IN_RTM |
444 | VPCMP $0, %YMM2, %YMMZERO, %k0 |
445 | kmovd %k0, %eax |
446 | # else |
447 | vpmovmskb %ymm2, %eax |
448 | # endif |
449 | testl %eax, %eax |
450 | jnz L(last_vec_x2_return) |
451 | |
452 | # ifdef USE_IN_RTM |
453 | kmovd %k2, %eax |
454 | testl %eax, %eax |
455 | jnz L(last_vec_x3_return) |
456 | |
457 | kmovd %k3, %eax |
458 | tzcntl %eax, %eax |
459 | leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax |
460 | # else |
461 | vpmovmskb %ymm3, %eax |
	/* Combine the matches in VEC3 (eax) with the matches in VEC4
	   (rcx, which still holds the vpternlogd OR of VEC2-VEC4; VEC2 is
	   known to have no match at this point).  Shifting rcx up by
	   VEC_SIZE bits lets a single tzcntq find the first match across
	   both vectors.  */
463 | salq $VEC_SIZE, %rcx |
464 | orq %rcx, %rax |
465 | tzcntq %rax, %rax |
466 | leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax |
467 | VZEROUPPER |
468 | # endif |
469 | ret |
470 | |
471 | .p2align 4,, 10 |
472 | L(last_vec_x1_return): |
473 | tzcntl %eax, %eax |
474 | # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 |
	/* NB: Multiply the character index by CHAR_SIZE to get the byte
	   offset.  */
476 | leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax |
477 | # else |
478 | addq %rdi, %rax |
479 | # endif |
480 | VZEROUPPER |
481 | ret |
482 | |
483 | .p2align 4 |
484 | L(last_vec_x2_return): |
485 | tzcntl %eax, %eax |
	/* NB: Multiply the index by RET_SCALE to get the byte offset
	   where relevant (RET_SCALE == CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined; otherwise RET_SCALE == 1).  */
489 | leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax |
490 | VZEROUPPER |
491 | ret |
492 | |
493 | # ifdef USE_IN_RTM |
494 | .p2align 4 |
495 | L(last_vec_x3_return): |
496 | tzcntl %eax, %eax |
	/* NB: Multiply the character index by CHAR_SIZE to get the byte
	   offset.  */
498 | leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax |
499 | ret |
500 | # endif |
501 | |
502 | # ifndef USE_AS_RAWMEMCHR |
503 | .p2align 4,, 5 |
504 | L(last_4x_vec_or_less_cmpeq): |
505 | VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 |
506 | kmovd %k0, %eax |
507 | subq $-(VEC_SIZE * 4), %rdi |
508 | /* Check first VEC regardless. */ |
509 | testl %eax, %eax |
510 | jnz L(first_vec_x1_check) |
511 | |
512 | /* If remaining length <= CHAR_PER_VEC * 2. */ |
513 | addl $(CHAR_PER_VEC * 2), %edx |
514 | jle L(last_2x_vec) |
515 | |
516 | .p2align 4 |
517 | L(last_4x_vec): |
518 | VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
519 | kmovd %k0, %eax |
520 | testl %eax, %eax |
521 | jnz L(last_vec_x2) |
522 | |
523 | |
524 | VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 |
525 | kmovd %k0, %eax |
526 | /* Create mask for possible matches within remaining length. */ |
527 | # ifdef USE_AS_WMEMCHR |
528 | movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx |
529 | bzhil %edx, %ecx, %ecx |
530 | # else |
531 | movq $-1, %rcx |
532 | bzhiq %rdx, %rcx, %rcx |
533 | # endif |
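	/* ecx/rcx now holds roughly (1 << remaining_length) - 1, counting
	   characters from the start of the vector at
	   (VEC_SIZE * 3)(%rdi).  */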
	/* Keep only the matches that fall within the remaining length.  */
535 | andl %ecx, %eax |
536 | jnz L(last_vec_x3) |
537 | |
	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after the
	   remaining length was found to be > CHAR_PER_VEC * 2).  */
540 | subl $CHAR_PER_VEC, %edx |
541 | jbe L(zero_end2) |
542 | |
543 | |
544 | VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 |
545 | kmovd %k0, %eax |
546 | /* Shift remaining length mask for last VEC. */ |
547 | # ifdef USE_AS_WMEMCHR |
548 | shrl $CHAR_PER_VEC, %ecx |
549 | # else |
550 | shrq $CHAR_PER_VEC, %rcx |
551 | # endif |
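	/* After the shift, bit i of the mask corresponds to character i
	   of the vector at (VEC_SIZE * 4)(%rdi).  */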
552 | andl %ecx, %eax |
553 | jz L(zero_end2) |
554 | bsfl %eax, %eax |
555 | leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax |
556 | L(zero_end2): |
557 | ret |
558 | |
559 | L(last_vec_x2): |
560 | tzcntl %eax, %eax |
561 | leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
562 | ret |
563 | |
564 | .p2align 4 |
565 | L(last_vec_x3): |
566 | tzcntl %eax, %eax |
567 | leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
568 | ret |
569 | # endif |
570 | /* 7 bytes from next cache line. */ |
571 | END (MEMCHR) |
572 | #endif |
573 | |