/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

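/* Build the EVEX implementation only when the x86-64 ISA level 4
   version may be selected (see isa-level.h).  */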
#if ISA_SHOULD_BUILD (4)

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

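/* Select the dword (wmemchr) or byte (memchr) forms of the EVEX
   compare/test/broadcast instructions and the matching character
   size.  */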
# ifdef USE_AS_WMEMCHR
#  define PC_SHIFT_GPR	rcx
#  define VPTESTN	vptestnmd
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4

#  define USE_WIDE_CHAR
# else
#  define PC_SHIFT_GPR	rdi
#  define VPTESTN	vptestnmb
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

# include "reg-macros.h"


/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
   encoding), use VEX encoding in the loop so we can use
   vpcmpeqb + vptern, which is more efficient than the EVEX
   alternative.  */
# if defined USE_IN_RTM || VEC_SIZE == 64
#  undef COND_VZEROUPPER
#  undef VZEROUPPER_RETURN
#  undef VZEROUPPER

#  define COND_VZEROUPPER
#  define VZEROUPPER_RETURN	ret
#  define VZEROUPPER

#  define USE_TERN_IN_LOOP	0
# else
#  define USE_TERN_IN_LOOP	1
#  undef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# if USE_TERN_IN_LOOP
/* The resulting bitmask from vpmovmskb has 4 bits set for each wchar
   so we don't want to multiply the resulting index.  */
#  define TERN_CHAR_MULT	1

#  ifdef USE_AS_WMEMCHR
#   define TEST_END()	inc %VRCX
#  else
#   define TEST_END()	add %rdx, %rcx
#  endif
# else
#  define TERN_CHAR_MULT	CHAR_SIZE
#  define TEST_END()	KORTEST %k2, %k3
# endif
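/* In all three variants TEST_END() sets ZF iff no match was found in
   the 4x VEC just compared in the main loop, so the jnz that follows
   it exits the loop on the first match.  */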

# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
#  ifndef USE_AS_WMEMCHR
#   define GPR_X0_IS_RET	1
#  else
#   define GPR_X0_IS_RET	0
#  endif
#  define GPR_X0	rax
# else
#  define GPR_X0_IS_RET	0
#  define GPR_X0	rdx
# endif

# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# if CHAR_PER_VEC == 64
#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
# else
#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
# endif
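/* LAST_VEC_OFFSET is the base offset used to compute the return
   pointer for a match found in the last vector(s) handled at the end
   of L(loop_vec_ret).  */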
# if CHAR_PER_VEC >= 32
#  define MASK_GPR(...)	VGPR(__VA_ARGS__)
# elif CHAR_PER_VEC == 16
#  define MASK_GPR(reg)	VGPR_SZ(reg, 16)
# else
#  define MASK_GPR(reg)	VGPR_SZ(reg, 8)
# endif
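/* MASK_GPR(reg) names reg at a width just large enough to hold
   CHAR_PER_VEC mask bits.  */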

# define VMATCH	VMM(0)
# define VMATCH_LO	VMM_lo(0)

# define PAGE_SIZE	4096


	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCHR, 6)
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero_0)

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
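	/* Broadcast the search character to every lane of VMATCH.  */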
	VPBROADCAST %esi, %VMATCH
	/* Check if we may cross a page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

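	/* Compare the first VEC's worth of data.  Each set bit in VRAX
	   marks a matching character.  */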
	VPCMPEQ	(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRAX
# ifndef USE_AS_WMEMCHR
	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
	   already a dependency between rax and rsi so no worries about
	   false-dep here.  */
	tzcnt	%VRAX, %VRSI
	/* If rdx <= rsi then either 1) rax was non-zero (there was a
	   match) but it was out of bounds or 2) rax was zero and rdx
	   was <= VEC_SIZE so we are done scanning.  */
	cmpq	%rsi, %rdx
	/* NB: Use a branch to return zero/non-zero.  Common usage will
	   branch on the result of the function (if the return is
	   null/non-null).  This branch can be used to predict the
	   ensuing one so there is no reason to extend the
	   data-dependency with cmovcc.  */
	jbe	L(zero_0)

	/* If rax is zero there was no match and, since we just checked
	   rdx > rsi (= CHAR_PER_VEC), the length extends past this VEC,
	   so keep scanning.  Otherwise we already tested the length
	   against tzcnt(rax) (in rsi) so we are good to return this
	   match.  */
	test	%VRAX, %VRAX
	jz	L(more_1x_vec)
	leaq	(%rdi, %rsi), %rax
# else

	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
	   > 1, so tzcnt of a zero mask does not return CHAR_PER_VEC.  */
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)
	tzcnt	%VRAX, %VRAX
	cmpl	%eax, %edx
	jbe	L(zero_0)
L(first_vec_x0_ret):
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# endif
	ret

	/* Only fits in the first cache line for VEC_SIZE == 32.  */
# if VEC_SIZE == 32
	.p2align 4,, 2
L(zero_0):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4,, 9
L(more_1x_vec):
# ifdef USE_AS_WMEMCHR
	/* For wmemchr we still need to test whether there was a match
	   in the first VEC.  Use bsf here (it sets ZF on a zero mask
	   and computes the index) so we can reuse
	   L(first_vec_x0_ret).  */
	bsf	%VRAX, %VRAX
	jnz	L(first_vec_x0_ret)
# endif

L(page_cross_continue):
# ifdef USE_AS_WMEMCHR
	/* We can't use the end of the buffer to recalculate the length
	   for wmemchr as len * CHAR_SIZE may overflow.  */
	leaq	-(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
	andq	$(VEC_SIZE * -1), %rdi
	subq	%rdi, %rax
	sarq	$2, %rax
	addq	%rdx, %rax
# else
	leaq	-(VEC_SIZE + 1)(%rdx, %rdi), %rax
	andq	$(VEC_SIZE * -1), %rdi
	subq	%rdi, %rax
# endif

	/* rax contains the remaining length - 1.  -1 so we can get imm8
	   encoding in a few additional places, saving code size.  */

	/* Needed regardless of the remaining length.  */
	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRDX

	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
	   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
	   large length to overflow and cause the subtract to carry
	   despite the length being above CHAR_PER_VEC * 2.  */
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rax
	ja	L(more_2x_vec)
L(last_2x_vec):
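	/* At most 2x VEC of the buffer remains.  The match mask for the
	   VEC at VEC_SIZE(%rdi) is already in VRDX.  */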

	test	%VRDX, %VRDX
	jnz	L(first_vec_x1_check)

	/* Check the end of the data.  NB: use 8-bit operations to save
	   code size.  We no longer need the full width of eax and will
	   perform a write-only operation on eax so there will be no
	   partial-register stalls.  */
	subb	$(CHAR_PER_VEC * 1 - 1), %al
	jle	L(zero_0)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX
# ifdef USE_AS_WMEMCHR
	/* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
	   as CHAR_PER_VEC != VEC_SIZE.  */
	test	%VRCX, %VRCX
	jz	L(zero_0)
# endif
	tzcnt	%VRCX, %VRCX
	cmp	%cl, %al

	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We fall
	   through to L(zero_0) for VEC_SIZE == 64 here as there is not
	   enough space before the next cache line to fit the `lea` for
	   the return.  */
# if VEC_SIZE == 64
	ja	L(first_vec_x2_ret)
L(zero_0):
	xorl	%eax, %eax
	ret
# else
	jbe	L(zero_0)
	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
	ret
# endif

	.p2align 4,, 5
L(first_vec_x1_check):
	bsf	%VRDX, %VRDX
	cmpb	%dl, %al
	jb	L(zero_4)
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
	ret

	/* Fits at the end of the cache line here for VEC_SIZE == 32.  */
# if VEC_SIZE == 32
L(zero_4):
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 4
L(first_vec_x2):
	bsf	%VRCX, %VRCX
L(first_vec_x2_ret):
	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
	ret

	/* Fits at the end of the cache line here for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
L(zero_4):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4,, 4
L(first_vec_x1):
	bsf	%VRDX, %VRDX
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
	ret


	.p2align 4,, 5
L(more_2x_vec):
	/* Length > VEC_SIZE * 2 so check the first 2x VEC before
	   rechecking the length.  */


	/* Already computed matches for the first VEC in rdx.  */
	test	%VRDX, %VRDX
	jnz	L(first_vec_x1)


	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x2)

	/* Needed regardless of the next length check.  */
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* Check if we are near the end.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rax
	ja	L(more_4x_vec)

	test	%VRCX, %VRCX
	jnz	L(first_vec_x3_check)

	/* Use 8-bit instructions to save code size.  We won't use the
	   full width of eax again and will perform a write-only
	   operation on eax so no worries about partial-register
	   stalls.  */
	subb	$(CHAR_PER_VEC * 3), %al
	jb	L(zero_2)
L(last_vec_check):
	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX
# ifdef USE_AS_WMEMCHR
	/* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
	   as CHAR_PER_VEC != VEC_SIZE.  */
	test	%VRCX, %VRCX
	jz	L(zero_2)
# endif
	tzcnt	%VRCX, %VRCX
	cmp	%cl, %al
	jae	L(first_vec_x4_ret)
L(zero_2):
	xorl	%eax, %eax
	ret

	/* Fits at the end of the cache line here for VEC_SIZE == 64.
	   For VEC_SIZE == 32 we put the return label at the end of
	   L(first_vec_x4).  */
# if VEC_SIZE == 64
L(first_vec_x4_ret):
	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
	ret
# endif

	.p2align 4,, 6
L(first_vec_x4):
	bsf	%VRCX, %VRCX
# if VEC_SIZE == 32
	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
	   cache line as where it is called from, so we might as well
	   save code size by reusing the return of L(first_vec_x4).  */
L(first_vec_x4_ret):
# endif
	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3_check):
	/* Need to adjust the remaining length before checking.  */
	addb	$-(CHAR_PER_VEC * 2), %al
	bsf	%VRCX, %VRCX
	cmpb	%cl, %al
	jb	L(zero_2)
	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3):
	bsf	%VRCX, %VRCX
	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
	ret

	.p2align 4,, 3
# if !USE_TERN_IN_LOOP
	.p2align 4,, 10
# endif
L(more_4x_vec):
	test	%VRCX, %VRCX
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x4)

	subq	$-(VEC_SIZE * 5), %rdi
	subq	$(CHAR_PER_VEC * 8), %rax
	jb	L(last_4x_vec)
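	/* Prepare for the main 4x VEC loop: align rdi down to a 4x
	   VEC_SIZE boundary and recompute the remaining-length counter
	   in rax relative to the aligned pointer so the loop can bound
	   itself with a single sub/jae.  */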

# ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
# else
	addq	%rdi, %rax
# endif


# if VEC_SIZE == 64
	/* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
	   processor has partial-register stalls (all have a merging
	   uop).  If that changes this can be removed.  */
	xorb	%dil, %dil
# else
	andq	$-(VEC_SIZE * 4), %rdi
# endif

# ifdef USE_AS_WMEMCHR
	subl	%edi, %ecx
	sarl	$2, %ecx
	addq	%rcx, %rax
# else
	subq	%rdi, %rax
# endif



# if USE_TERN_IN_LOOP
	/* Copy VMATCH to the low ymm so we can use vpcmpeq, which is
	   not encodable with EVEX registers.  NB: this is VEC_SIZE ==
	   32 only as there is no way to encode vpcmpeq with zmm0-15.  */
	vmovdqa64 %VMATCH, %VMATCH_LO
# endif

	.p2align 4,, 11
L(loop_4x_vec):
	/* Two versions of the loop: one that does not require
	   vzeroupper by not using ymm0-15 and one that does require
	   vzeroupper because it uses ymm0-15.  The reason ymm0-15 is
	   used at all is that there is no EVEX encoding of vpcmpeq, and
	   with vpcmpeq this loop can be performed more efficiently.
	   The non-vzeroupper version is safe for RTM while the
	   vzeroupper version should be preferred if RTM is not
	   supported.  Which loop version we use is determined by
	   USE_TERN_IN_LOOP.  */

# if USE_TERN_IN_LOOP
	/* Since vptern can only take 3x vectors, it is fastest to do
	   one vector separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so we can only
	   save an instruction by using a not-equals mask on vptern for
	   wmemchr.  */
	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  else
	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and OR them all together with
	   vptern.  */
	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
#  ifdef USE_AS_WMEMCHR
	/* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3) and
	   VEC_lo(4), as well as combining the result from VEC(0) via
	   the zero mask.  */
	vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
	vpmovmskb %VMM_lo(4), %VRCX
#  else
	/* 254 is the imm8 for ORing VEC_lo(2), VEC_lo(3), VEC_lo(4)
	   into VEC_lo(4).  */
	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
	vpmovmskb %VMM_lo(4), %VRCX
	KMOV	%k1, %edx
#  endif

# else
	/* Loop version that uses EVEX encoding.  */
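	/* The EVEX loop detects matches across 4x VEC without leaving
	   mask/zmm registers: vpxorq gives an all-zero lane wherever a
	   character matches, VPMINU folds the two xor results together
	   (zero-masked by the not-equal mask from the first VEC so its
	   matches also become zero lanes), VPTESTN converts the zero
	   lanes into k2, and the fourth VEC is compared directly into
	   k3.  TEST_END (KORTEST %k2, %k3) then tests all four vectors
	   at once.  */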
	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
	vpxorq	(VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k3
	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
	VPTESTN	%VMM(3), %VMM(3), %k2
# endif


	TEST_END ()
	jnz	L(loop_vec_ret)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rax
	jae	L(loop_4x_vec)

	/* COND_VZEROUPPER is vzeroupper if we use the VEX-encoded
	   loop.  */
	COND_VZEROUPPER

	.p2align 4,, 10
L(last_4x_vec):
	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
	   instructions on eax from here on out.  */
# if CHAR_PER_VEC != 64
	andl	$(CHAR_PER_VEC * 4 - 1), %eax
# endif
	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k0
	subq	$(VEC_SIZE * 1), %rdi
	KMOV	%k0, %VRDX
	cmpb	$(CHAR_PER_VEC * 2 - 1), %al
	jbe	L(last_2x_vec)
	test	%VRDX, %VRDX
	jnz	L(last_vec_x1_novzero)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRDX
	test	%VRDX, %VRDX
	jnz	L(last_vec_x2_novzero)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x3_check)

	subb	$(CHAR_PER_VEC * 3), %al
	jae	L(last_vec_check)

	xorl	%eax, %eax
	ret

# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
	addq	$VEC_SIZE, %rdi
L(last_vec_x1_novzero):
	bsf	%VRDX, %VRDX
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
	ret
# endif

# if CHAR_PER_VEC == 64
	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
	   64, it needs a separate return label.  */
	.p2align 4,, 4
L(last_vec_x2):
L(last_vec_x2_novzero):
	bsf	%VRDX, %VRDX
	leaq	(VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
	ret
# endif

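	/* A match was found somewhere in the 4x VEC just checked by the
	   loop; work out which vector it was in and compute the return
	   pointer.  */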
	.p2align 4,, 4
L(loop_vec_ret):
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
	KMOV	%k1, %VRAX
	inc	%MASK_GPR(rax)
# else
	test	%VRDX, %VRDX
# endif
	jnz	L(last_vec_x0)


# if USE_TERN_IN_LOOP
	vpmovmskb %VMM_lo(2), %VRDX
# else
	VPTESTN	%VMM(2), %VMM(2), %k1
	KMOV	%k1, %VRDX
# endif
	test	%VRDX, %VRDX
	jnz	L(last_vec_x1)


# if USE_TERN_IN_LOOP
	vpmovmskb %VMM_lo(3), %VRDX
# else
	KMOV	%k2, %VRDX
# endif

	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
	   (only if we used the VEX-encoded loop).  */
	COND_VZEROUPPER

	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
	   CHAR_PER_VEC == 64 we test the last 2x VEC separately; for
	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
	   VEC in a single GPR.  */
# if CHAR_PER_VEC == 64
#  if USE_TERN_IN_LOOP
#   error "Unsupported"
#  endif


	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
	test	%VRDX, %VRDX
	jnz	L(last_vec_x2)
	KMOV	%k3, %VRDX
# else
	/* CHAR_PER_VEC <= 32 so we can combine the results from the
	   last 2x VEC.  */

#  if !USE_TERN_IN_LOOP
	KMOV	%k3, %VRCX
#  endif
	salq	$(VEC_SIZE / TERN_CHAR_MULT), %rcx
	addq	%rcx, %rdx
#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
#  endif
# endif
	bsf	%rdx, %rdx
	leaq	(LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
	ret

	.p2align 4,, 8
L(last_vec_x1):
	COND_VZEROUPPER
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x1_novzero):
# endif
	bsf	%VRDX, %VRDX
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
	ret


	.p2align 4,, 4
L(last_vec_x0):
	COND_VZEROUPPER
	bsf	%VGPR(GPR_X0), %VGPR(GPR_X0)
# if GPR_X0_IS_RET
	addq	%rdi, %rax
# else
	leaq	(%rdi, %GPR_X0, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 6
L(page_cross):
	/* Need to preserve eax (the page offset of rdi) to compute the
	   number of in-bounds bytes we are checking.  */
# ifdef USE_AS_WMEMCHR
	movl	%eax, %ecx
# else
	xorl	%ecx, %ecx
	subl	%eax, %ecx
# endif

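	/* The xor recovers the page base from rdi and its page offset
	   (in rax), so the compare below loads the last aligned VEC of
	   the page and cannot fault past it.  The match mask is then
	   shifted right to discard the characters that precede rdi.  */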
	xorq	%rdi, %rax
	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
	KMOV	%k0, %VRAX

# ifdef USE_AS_WMEMCHR
	/* NB: Divide by CHAR_SIZE to shift out the out-of-bounds
	   characters.  */
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif


	shrx	%VGPR(PC_SHIFT_GPR), %VRAX, %VRAX

# ifdef USE_AS_WMEMCHR
	negl	%ecx
# endif

	/* Mask the lower bits of ecx (the negated page offset) to get
	   the number of chars until the next VEC.  */
	andl	$(CHAR_PER_VEC - 1), %ecx

	/* Check if the search is entirely contained in the remainder of
	   the page.  */
	cmpq	%rcx, %rdx
	jbe	L(page_cross_ret)

	/* The length crosses the page, so if rax is zero (no matches)
	   continue.  */
	test	%VRAX, %VRAX
	jz	L(page_cross_continue)

	/* If rdx > rcx then any match here must be in
	   [buf, buf + len).  */
	tzcnt	%VRAX, %VRAX
# ifdef USE_AS_WMEMCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

	.p2align 4,, 2
L(page_cross_zero):
	xorl	%eax, %eax
	ret

	.p2align 4,, 4
L(page_cross_ret):
	/* The search is entirely contained in the page-cross case.  */
# ifdef USE_AS_WMEMCHR
	test	%VRAX, %VRAX
	jz	L(page_cross_zero)
# endif
	tzcnt	%VRAX, %VRAX
	cmpl	%eax, %edx
	jbe	L(page_cross_zero)
# ifdef USE_AS_WMEMCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret
END (MEMCHR)
#endif