/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

 /* In the 4x loop the RTM and non-RTM versions have data pointers
    that are off by VEC_SIZE * 4, with the RTM version being
    VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.
    Additionally, because the RTM version uses vpcmp, which stores a
    bit per element compared, while the non-RTM version uses vpcmpeq,
    which stores a bit per byte compared, a RET_SCALE of CHAR_SIZE is
    only relevant for the RTM version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif

 /* In the return from the 4x loop the memchr and rawmemchr versions
    have data pointers off by VEC_SIZE * 4, with the memchr version
    being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle special.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 5
L(first_vec_x0):
	/* Check if first match was before length.  */
	tzcntl	%eax, %eax
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
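	/* NB: If the match index is not below the length it is out of
	   bounds, so return NULL (rcx is zero).  */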
	cmovle	%rcx, %rax
	ret
# else
	/* NB: first_vec_x0 is 17 bytes which will leave
	   cross_page_boundary (which is relatively cold) close enough
	   to ideal alignment.  So only realign L(cross_page_boundary)
	   if rawmemchr.  */
	.p2align 4
# endif
L(cross_page_boundary):
	/* Save pointer before aligning as its original value is
	   necessary for computing the return address if a byte is
	   found, or for adjusting the length if it is not and this is
	   memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and
	   rdi for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
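	/* esi now holds the number of CHARs from the original pointer
	   to the end of the page (always less than CHAR_PER_VEC on this
	   path).  */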
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
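	/* esi now counts CHARs from the original (unaligned) pointer
	   through the end of the 4th vector checked below; it is
	   subtracted from the length to detect if the end of the data
	   is near.  */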
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at last CHAR_PER_VEC * 4 length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
	 */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use the VEX-encoded vpcmpeq,
	   which cannot encode the EVEX-only registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop.  One that does not require
	   vzeroupper by not using ymm0-ymm15 and another that does
	   require vzeroupper because it uses ymm0-ymm15.  The reason
	   ymm0-ymm15 is used at all is that there is no EVEX encoding
	   of vpcmpeq (to a vector destination) and with vpcmpeq this
	   loop can be performed more efficiently.  The non-vzeroupper
	   version is safe for RTM while the vzeroupper version should
	   be preferred if RTM is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* xor will set bytes matching esi to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
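	/* k2 now has a bit set for every element that matched in VEC1,
	   VEC2 or VEC3 (VEC1 matches were folded in via the k1 zero
	   mask).  */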
# else
	/* Since vptern can only take 3x vectors it is fastest to do the
	   first vec separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64 so we can only
	   save an instruction by using the not-equals mask on vptern
	   with wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and or them all together with vptern.
	 */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* This takes the not of the or between ymm2, ymm3, ymm4 as well
	   as combining the result from VEC1 with the zero mask.  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 is the mask for oring ymm2, ymm3, ymm4 into ymm4.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
# endif

# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the not of the matches.  All 1s means no
	   matches.  incl will overflow and set the zero flag if that
	   is the case.  */
	incl	%ecx
#  else
	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero.  Adding
	   to ecx is not an issue because if eax is non-zero it will be
	   used for returning the match.  If it is zero the add does
	   nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into less than 4 remaining vectors of length
	   case.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
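	/* NB: tzcnt of a zero mask is 32 (the operand size).  The
	   remaining length here is at most CHAR_PER_VEC, so the jbe
	   below also returns NULL when there is no match.  */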
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret


	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if match was found in
	   loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has not of matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
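	/* After the sub/inc above eax is zero iff VEC1 had no match.
	   Otherwise the lowest set bit of eax is the first match, which
	   the tzcnt in L(last_vec_x1_return) relies on.  */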
# else
	/* eax already has matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
	salq	$VEC_SIZE, %rcx
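	/* NB: rcx here is the byte mask of the OR of VEC2-VEC4 matches
	   (VEC1 and VEC2 are already known to have none).  The shift
	   above places it in bits 32-63, above the VEC3 mask in eax, so
	   the 64-bit tzcnt below yields the byte offset from VEC3 and
	   only lands in the VEC4 range when VEC3 has no match.  */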
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined.  Otherwise RET_SCALE = 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create mask for possible matches within remaining length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
	/* Test matches in data against length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
	   the remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift remaining length mask for last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

END (MEMCHR)
#endif