/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

	/* In the 4x loop the RTM and non-RTM versions have the data
	   pointer off by VEC_SIZE * 4, with the RTM version being
	   VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.
	   Also, because the RTM version uses vpcmp, which stores a bit
	   per element compared, while the non-RTM version uses vpcmpeq,
	   which stores a bit per byte compared, RET_SCALE of CHAR_SIZE
	   is only relevant for the RTM version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif

	/* In the return from the 4x loop the memchr and rawmemchr
	   versions have data pointers off by VEC_SIZE * 4, with the
	   memchr version being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif
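	/* For illustration, the four combinations give:
	     rawmemchr, USE_IN_RTM:	RET_OFFSET == 0
	     rawmemchr, non-RTM:	RET_OFFSET == -(VEC_SIZE * 4)
	     memchr, USE_IN_RTM:	RET_OFFSET == VEC_SIZE * 4
	     memchr, non-RTM:		RET_OFFSET == 0
	   i.e. RET_OFFSET is the amount that must be added to rdi in the
	   return-from-loop code so that it points at the first of the
	   four vectors that were just compared.  */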

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
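	/* Rough C sketch of the page-cross test above (illustrative
	   only):

	       if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
	         goto cross_page_boundary;

	   i.e. an unaligned VEC_SIZE load at s could touch the next
	   page, which might fault even though the match (or the end of
	   the length) lies before the page boundary.  */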

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Check if the first match was before length.  NB: tzcnt has a
	   false data-dependency on its destination.  eax already had a
	   data-dependency on esi so this should have no effect here.  */
	tzcntl	%eax, %esi
#  ifdef USE_AS_WMEMCHR
	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
#  else
	addq	%rsi, %rdi
#  endif
	xorl	%eax, %eax
	cmpl	%esi, %edx
	cmovg	%rdi, %rax
	ret
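	/* The tail above is, roughly, the following branchless C
	   (illustrative sketch only; pos is the CHAR index of the first
	   match, 32 if there was none):

	       pos = tzcnt (mask);
	       return (long) len > pos ? s + pos * CHAR_SIZE : NULL;

	   cmovg performs the select without a branch.  */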
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save pointer before aligning as its original value is
	   necessary for computing the return address if a byte is found
	   or for adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and
	   rdi for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret
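	/* Rough C sketch of the page-cross path above (illustrative
	   only; cmp_mask () stands for the VPCMP + kmovd pair):

	       aligned = (char *) ((uintptr_t) s & -VEC_SIZE);
	       mask = cmp_mask (aligned) >> ((s - aligned) / CHAR_SIZE);
	       if (mask != 0)
	         return s + tzcnt (mask) * CHAR_SIZE;
	       // memchr only: branch to L(first_vec_x0) instead when the
	       // CHARs left before the page boundary already cover the
	       // whole remaining length.

	   Loading from the aligned address cannot fault past the end of
	   the page that s itself lies in.  */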

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is for adjusting the length to see if we are near the end:
	   it ends up holding the number of CHARs from the original
	   pointer through the end of the 4th vector checked below.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at last CHAR_PER_VEC * 4 length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
	 */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
	   encodable with EVEX registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop.  One that does not require
	   vzeroupper by not using ymm0-ymm15 and another that does
	   require vzeroupper because it uses ymm0-ymm15.  The reason why
	   ymm0-ymm15 is used at all is because there is no EVEX encoding
	   of vpcmpeq with a vector destination, and with vpcmpeq this
	   loop can be performed more efficiently.  The non-vzeroupper
	   version is safe for RTM while the vzeroupper version should be
	   preferred if RTM is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* xor will set bytes that match esi to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
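	/* How the reduction above folds VEC1-VEC3 into one test
	   (illustrative, per element i):

	       ymm3[i] = k1[i] ? min (vec2[i] ^ CHAR, vec3[i] ^ CHAR) : 0

	   so ymm3[i] == 0 iff VEC1[i] matched (its k1 bit is clear) or
	   VEC2[i] / VEC3[i] equals CHAR (the xor is zero).  k2 collects
	   those zeros, and the kortest with k3 below then tests all four
	   vectors with a single flag update.  */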
# else
	/* Since vptern can only take 3x vectors it is fastest to do 1
	   vec separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so we can only
	   save an instruction by using a not-equals mask on vptern with
	   wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and or them all together with vptern.
	 */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* This takes the not of the or between ymm2, ymm3, ymm4 as well
	   as combining the result from VEC1 with the zero mask.  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 is the mask for or'ing ymm2, ymm3, ymm4 into ymm4.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
# endif
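	/* vpternlogd evaluates an arbitrary 3-input boolean function per
	   bit; the imm8 is its truth table.  The two values used above
	   (illustrative):
	       254 (0xfe):  A | B | C, i.e. a byte is non-zero if any of
	                    VEC2-VEC4 matched at that position.
	       1 (0x01):    ~(A | B | C); together with the {%k1}{z} zero
	                    mask a cleared bit in ecx then means a match
	                    in any of VEC1-VEC4, which is why the wmemchr
	                    loop below tests ecx with incl.  */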

# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the not of the matches.  All 1s means no matches.
	   incl will overflow and set the zero flag if that is the
	   case.  */
	incl	%ecx
#  else
	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
	   Adding eax into ecx is not an issue because if eax is non-zero
	   it will be used for returning the match.  If it is zero the
	   add does nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the less than 4 remaining vectors of length
	   case.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if first VEC contained a match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret

L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x1_check):
	/* eax must be non-zero.  Use bsfl to save code size.  */
	bsfl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the not of the matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
# else
	/* eax already has matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
	salq	$VEC_SIZE, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4,, 10
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined.  Otherwise RET_SCALE = 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	.p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create mask for possible matches within remaining length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
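	/* bzhi clears every bit of the mask at a CHAR index >= edx,
	   where edx here is the number of CHARs remaining counted from
	   the base of the VEC just compared, i.e. roughly (illustrative
	   sketch):

	       in_bounds = all_ones & ((1 << remaining) - 1);
	       matches  &= in_bounds;

	   The same mask, shifted right by CHAR_PER_VEC further down,
	   serves as the in-bounds mask for the final VEC.  */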
	/* Test matches in data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
	   remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift remaining length mask for last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	bsfl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif
	/* 7 bytes from next cache line.  */
END (MEMCHR)
#endif