/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	esi

#  define kunpck	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP	vpcmpd
# else
#  define SHIFT_REG	edi

#  define kunpck	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP	vpcmpb
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMMMATCH	ymm17
# define YMMSAVE	ymm18

# define YMM1	ymm19
# define YMM2	ymm20
# define YMM3	ymm21
# define YMM4	ymm22
# define YMM5	ymm23
# define YMM6	ymm24
# define YMM7	ymm25
# define YMM8	ymm26


# define VEC_SIZE	32
# define PAGE_SIZE	4096
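
/* The first VEC is handled on its own so that short strings stay
   cheap.  Longer strings are checked one VEC at a time for the next
   three VECs and then scanned two VECs per iteration, keeping track
   of the last search CHAR match seen before the terminating zero
   CHAR.  */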
	.section .text.evex, "ax", @progbits
ENTRY(STRRCHR)
	movl %edi, %eax
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_boundary)

L(page_cross_continue):
	VMOVU (%rdi), %YMM1
	/* k0 has a 1 for each zero CHAR in YMM1.  */
	VPTESTN %YMM1, %YMM1, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jz L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in YMM1.  */
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).  */
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret0)
	/* Get last match (the `andl` removed any out of bounds
	   matches).  */
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret0):
	ret

	/* Returns for first vec x1/x2/x3 have a hard-coded backward
	   search path for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMP $0, %YMMMATCH, %YMM2, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* eax non-zero if search CHAR in range.  */
	andl %ecx, %eax
	jnz L(first_vec_x1_return)

	/* fallthrough: no match in YMM2, so we need to check for
	   earlier matches (in YMM1).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jz L(ret1)
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rsi, %rax, CHAR_SIZE), %rax
# else
	addq %rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMP $0, %YMM3, %YMMMATCH, %k3
	VPCMP $0, %YMM2, %YMMMATCH, %k2
	/* K2 and K3 have a 1 for any search CHAR match.  Test whether
	   either of them has any matches; otherwise check YMM1.  */
	kortestd %k2, %k3
	jz L(first_vec_x0_test)

	/* Guaranteed that YMM2 and YMM3 are within range so merge the
	   two bitmasks then get last result.  */
	kunpck %k2, %k3, %k3
	kmovq %k3, %rax
	bsrq %rax, %rax
	leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3):
	VPCMP $0, %YMMMATCH, %YMM4, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
	andl %ecx, %eax
	jz L(first_vec_x1_or_x2)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMP $0, %YMMMATCH, %YMM2, %k1
	kmovd %k1, %eax
	/* Check YMM2 for last match first.  If no match try YMM1.  */
	testl %eax, %eax
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl %eax, %eax
	leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x2):
	VPCMP $0, %YMMMATCH, %YMM3, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* Check YMM3 for last match first.  If no match try YMM2/YMM1.  */
	andl %ecx, %eax
	jz L(first_vec_x0_x1_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4
L(aligned_more):
	/* Need to keep original pointer in case YMM1 has last match.  */
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rdi
	VMOVU VEC_SIZE(%rdi), %YMM2
	VPTESTN %YMM2, %YMM2, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x1)

	VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
	VPTESTN %YMM3, %YMM3, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x2)

	VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
	VPTESTN %YMM4, %YMM4, %k0
	kmovd %k0, %ecx
	movq %rdi, %r8
	testl %ecx, %ecx
	jnz L(first_vec_x3)

	andq $-(VEC_SIZE * 2), %rdi
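	/* rdi is now (VEC_SIZE * 2)-aligned.  rsi still holds the
	   original pointer and r8 the VEC_SIZE-aligned pointer so the
	   fallback paths above can recover matches in YMM1-YMM4.  */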
	.p2align 4
L(first_aligned_loop):
	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
	   they don't store a match.  */
	VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
	VMOVA (VEC_SIZE * 5)(%rdi), %YMM6

	VPCMP $0, %YMM5, %YMMMATCH, %k2
	vpxord %YMM6, %YMMMATCH, %YMM7

	VPMIN %YMM5, %YMM6, %YMM8
	VPMIN %YMM8, %YMM7, %YMM7
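	/* YMM8 has a zero CHAR wherever YMM5 or YMM6 has one, and YMM7
	   additionally has a zero wherever YMM6 matches the search
	   CHAR.  Together with k2 (matches in YMM5) this covers every
	   condition that ends the loop.  */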

	VPTESTN %YMM7, %YMM7, %k1
	subq $(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz L(first_aligned_loop)

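	/* The loop stopped on a zero CHAR or a search CHAR match.  k1
	   below is non-zero iff YMM5/YMM6 contain a zero CHAR; if they
	   do not, the loop stopped on a new match, which becomes the
	   furthest match for the second loop.  */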
	VPCMP $0, %YMM6, %YMMMATCH, %k3
	VPTESTN %YMM8, %YMM8, %k1
	ktestd %k1, %k1
	jz L(second_aligned_loop_prep)

	kortestd %k2, %k3
	jnz L(return_first_aligned_loop)

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMP $0, %YMM4, %YMMMATCH, %k4
	kmovd %k4, %eax
	testl %eax, %eax
	jz L(first_vec_x1_or_x2)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
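	/* A zero CHAR was found in YMM5/YMM6.  Merge the match masks
	   for the two VECs, keep only matches at or before the first
	   zero CHAR, and return the last one.  If none remain, any
	   earlier match must be in YMM1-YMM4.  */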
	VPTESTN %YMM5, %YMM5, %k0
	kunpck %k0, %k1, %k0
	kmov_2x %k0, %maskz_2x

	blsmsk %maskz_2x, %maskz_2x
	kunpck %k2, %k3, %k3
	kmov_2x %k3, %maskm_2x
	and %maskz_2x, %maskm_2x
	jz L(first_vec_x1_or_x2_or_x3)

	bsr %maskm_2x, %maskm_2x
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path, so to
	   speak.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
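	/* Save the current position and the combined match mask for
	   YMM5/YMM6 (or YMM1/YMM2 on later passes) as the furthest
	   match seen so far.  */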
	movq %rdi, %rsi
	kunpck %k2, %k3, %k4

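	/* Same detection scheme as the first loop, but on loop exit the
	   fallback is the match saved in k4/rsi rather than the VECs
	   checked before the loop.  */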
	.p2align 4
L(second_aligned_loop):
	VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
	VMOVU (VEC_SIZE * 5)(%rdi), %YMM2

	VPCMP $0, %YMM1, %YMMMATCH, %k2
	vpxord %YMM2, %YMMMATCH, %YMM3

	VPMIN %YMM1, %YMM2, %YMM4
	VPMIN %YMM3, %YMM4, %YMM3

	VPTESTN %YMM3, %YMM3, %k1
	subq $(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz L(second_aligned_loop)

	VPCMP $0, %YMM2, %YMMMATCH, %k3
	VPTESTN %YMM4, %YMM4, %k1
	ktestd %k1, %k1
	jz L(second_aligned_loop_set_furthest_match)

	kortestd %k2, %k3
	/* Branch here because there is a significant advantage in terms
	   of the output dependency chain in using edx.  */
	jnz L(return_new_match)
L(return_old_match):
	kmovq %k4, %rax
	bsrq %rax, %rax
	leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
	ret

L(return_new_match):
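	/* Same as L(return_first_aligned_loop): keep only the matches
	   at or before the first zero CHAR in YMM1/YMM2 and return the
	   last one, falling back to the saved match if none remain.  */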
	VPTESTN %YMM1, %YMM1, %k0
	kunpck %k0, %k1, %k0
	kmov_2x %k0, %maskz_2x

	blsmsk %maskz_2x, %maskz_2x
	kunpck %k2, %k3, %k3
	kmov_2x %k3, %maskm_2x
	and %maskz_2x, %maskm_2x
	jz L(return_old_match)

	bsr %maskm_2x, %maskm_2x
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor
	   rdi, rax` gives a pointer with all page offset bits cleared,
	   so an offset of (PAGE_SIZE - VEC_SIZE) loads the last aligned
	   VEC before the page cross (guaranteed to be safe to read).
	   Doing this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE,
	   %rax` saves a bit of code size.  */
	xorq %rdi, %rax
	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
	VPTESTN %YMM1, %YMM1, %k0
	kmovd %k0, %ecx

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
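	/* Each mask bit corresponds to a 4-byte CHAR, so convert the
	   byte offset within the VEC to a CHAR offset.  */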
	movl %edi, %esi
	andl $(VEC_SIZE - 1), %esi
	shrl $2, %esi
# endif
	shrxl %SHIFT_REG, %ecx, %ecx

	testl %ecx, %ecx
	jz L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrxl %SHIFT_REG, %eax, %eax

	/* Check if any search CHAR match in range.  */
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret3)
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret3):
	ret

END(STRRCHR)
#endif