1/* strchr/strchrnul optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
#if IS_IN (libc)

# include <sysdep.h>

/* STRCHR may be predefined by the multiarch build (e.g. as
   __strchrnul_evex or __wcschr_evex) when this file is reused for the
   strchrnul / wcschr variants via the USE_AS_* defines below.  */
# ifndef STRCHR
# define STRCHR __strchr_evex
# endif

/* EVEX-encoded unaligned / aligned vector moves.  */
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

/* Element-size dependent primitives: dword ops for wcschr (wchar_t is
   4 bytes on this ABI), byte ops for strchr/strchrnul.  CHAR_REG is
   the sub-register of esi holding one CHAR for scalar re-checks;
   SHIFT_REG is the register used for the page-cross bit shift.  */
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMP vpcmpd
# define VPMINU vpminud
# define CHAR_REG esi
# define SHIFT_REG ecx
# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMP vpcmpb
# define VPMINU vpminub
# define CHAR_REG sil
# define SHIFT_REG edx
# define CHAR_SIZE 1
# endif

# define XMMZERO xmm16

/* NB: xmm16/ymm16-ymm25 are only reachable with EVEX encodings and do
   not alias the legacy SSE state, so the function never needs
   vzeroupper.  YMMZERO stays all-zero throughout and is the comparand
   used to detect null CHARs.  */
# define YMMZERO ymm16
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
# define YMM3 ymm20
# define YMM4 ymm21
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24
# define YMM8 ymm25

# define VEC_SIZE 32
# define PAGE_SIZE 4096
/* Number of CHAR-sized elements per 32-byte vector (32 or 8).  */
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
62
	.section .text.evex,"ax",@progbits

/* char *STRCHR (const char *s, int c) — EVEX (256-bit) variant.
   ABI:  SysV AMD64.  In: rdi = s, esi = c.
   Out:  rax = pointer to the first CHAR or (for strchrnul) to the
   null terminator; plain strchr returns NULL when CHAR is absent.  */
ENTRY (STRCHR)
	/* Broadcast CHAR to YMM0.  */
	VPBROADCAST %esi, %YMM0
	movl %edi, %eax
	andl $(PAGE_SIZE - 1), %eax
	/* Zero the register compared against to find null CHARs.  */
	vpxorq %XMMZERO, %XMMZERO, %XMMZERO

	/* Check if we cross page boundary with one vector load.
	   Otherwise it is safe to use an unaligned load.  */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
	   null bytes.  */
	VMOVU (%rdi), %YMM1

	/* Leaves only CHARS matching esi as 0.  */
	vpxorq %YMM1, %YMM0, %YMM2
	/* min(data, data ^ CHAR) is zero exactly where data is CHAR or
	   null, so a single compare against zero finds both.  */
	VPMINU %YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP $0, %YMMZERO, %YMM2, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
	 */
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte; NULL-return if it was null.  */
	cmp (%rax), %CHAR_REG
	jne L(zero)
# endif
	ret

	/* .p2align 5 helps keep performance more consistent if ENTRY()
	   alignment % 32 was either 16 or 0. As well this makes the
	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
	   easier.  */
	.p2align 5
L(first_vec_x3):
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl %eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x4):
	/* NB: reached with k0 = CHAR matches, k1 = null matches (set
	   just before the kortestd that branched here).  */
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd %k0, %eax
	tzcntl %eax, %eax
	kmovd %k1, %ecx
	/* bzhil result will be nonzero iff a null precedes the first
	   CHAR match (it keeps only the null bits below eax).  */
	bzhil %eax, %ecx, %ecx
	jne L(zero)
# else
	/* Combine CHAR and null matches.  */
	kord %k0, %k1, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero)

# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	/* NB: reached with k0 = CHAR matches, k1 = null matches (set
	   just before the kortestd that branched here).  */
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd %k0, %eax
	tzcntl %eax, %eax
	kmovd %k1, %ecx
	/* bzhil result will be nonzero iff a null precedes the first
	   CHAR match (it keeps only the null bits below eax).  */
	bzhil %eax, %ecx, %ecx
	jne L(zero)
# else
	/* Combine CHAR and null matches.  */
	kord %k0, %k1, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE.  */
	andq $-VEC_SIZE, %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
	   data is only aligned to VEC_SIZE. Use two alternating methods
	   for checking VEC to balance latency and port contention.  */

	/* This method (xor + min) has higher latency but has better
	   port distribution.  */
	VMOVA (VEC_SIZE)(%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq %YMM1, %YMM0, %YMM2
	VPMINU %YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP $0, %YMMZERO, %YMM2, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	/* This method (two compares + kortest) has lower latency but
	   has worse port distribution.  */
	VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP $0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null CHAR in YMM1.  */
	VPCMP $0, %YMM1, %YMMZERO, %k1
	kortestd %k0, %k1
	jnz L(first_vec_x2)

	VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq %YMM1, %YMM0, %YMM2
	VPMINU %YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP $0, %YMMZERO, %YMM2, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP $0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null CHAR in YMM1.  */
	VPCMP $0, %YMM1, %YMMZERO, %k1
	kortestd %k0, %k1
	jnz L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 for the loop.  */
	addq $VEC_SIZE, %rdi
	andq $-(VEC_SIZE * 4), %rdi

	.p2align 4
L(loop_4x_vec):
	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
	   encoding.  */
	VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
	VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
	VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
	VMOVA (VEC_SIZE * 7)(%rdi), %YMM4

	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
	   zero.  */
	vpxorq %YMM1, %YMM0, %YMM5
	/* For YMM2 and YMM4 cmp not-equals to CHAR and store result in
	   a k register. It is possible to save either 1 or 2
	   instructions using the cmp not-equals method for either YMM1
	   or YMM1 and YMM3 respectively but the bottleneck on p5 makes
	   it not worth it.  */
	VPCMP $4, %YMM0, %YMM2, %k2
	vpxorq %YMM3, %YMM0, %YMM7
	VPCMP $4, %YMM0, %YMM4, %k4

	/* Use min to select all zeros from either xor or end of string.
	 */
	VPMINU %YMM1, %YMM5, %YMM1
	VPMINU %YMM3, %YMM7, %YMM3

	/* Use min + zeromask to select for zeros. Since k2 and k4 will
	   have 0 as positions that matched with CHAR which will set
	   zero in the corresponding destination bytes in YMM2 / YMM4.
	 */
	VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
	VPMINU %YMM3, %YMM4, %YMM4
	VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}

	/* After the min chain YMM4 has a zero CHAR iff any of the four
	   VECs contained CHAR or a null.  */
	VPCMP $0, %YMMZERO, %YMM4, %k1
	kmovd %k1, %ecx
	subq $-(VEC_SIZE * 4), %rdi
	testl %ecx, %ecx
	jz L(loop_4x_vec)

	/* Match somewhere in YMM1-YMM4; narrow it down.  NB: rdi was
	   already advanced above, so YMM1 corresponds to (%rdi).  */
	VPCMP $0, %YMMZERO, %YMM1, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(last_vec_x1)

	VPCMP $0, %YMMZERO, %YMM2, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(last_vec_x2)

	VPCMP $0, %YMMZERO, %YMM3, %k0
	kmovd %k0, %eax
	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
# ifdef USE_AS_WCSCHR
	/* CHAR_PER_VEC is 8 for wcschr, so YMM4's mask begins at bit 8.
	 */
	sall $8, %ecx
	orl %ecx, %eax
	tzcntl %eax, %eax
# else
	salq $32, %rcx
	orq %rcx, %rax
	tzcntq %rax, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Check if match was CHAR or null.  */
	cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero_end)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl %eax, %eax
	ret
# endif

	.p2align 4
L(last_vec_x1):
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero_end)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero_end)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Save the unaligned pointer; rdi is rounded down below.  */
	movq %rdi, %rdx
	/* Align rdi.  */
	andq $-VEC_SIZE, %rdi
	VMOVA (%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq %YMM1, %YMM0, %YMM2
	VPMINU %YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP $0, %YMMZERO, %YMM2, %k0
	kmovd %k0, %eax
	/* Remove the leading bits (elements before the real start of
	   the string).  */
# ifdef USE_AS_WCSCHR
	movl %edx, %SHIFT_REG
	/* NB: Divide shift count by 4 since each bit in K1 represent 4
	   bytes.  */
	sarl $2, %SHIFT_REG
	andl $(CHAR_PER_VEC - 1), %SHIFT_REG
# endif
	/* For strchr SHIFT_REG is edx and the low bits of the original
	   pointer directly give the byte offset within the VEC (sarx
	   only uses the low 5 bits of the count).  */
	sarxl %SHIFT_REG, %eax, %eax
	/* If eax is zero continue.  */
	testl %eax, %eax
	jz L(cross_page_continue)
	tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check to see if match was CHAR or null.  */
	cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
	jne L(zero_end)
# endif
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
	addq %rdx, %rax
# endif
	ret

END (STRCHR)
379# endif
380