/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# include "x86-evex256-vecs.h"

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	rsi
#  define kunpck_2x	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPCMP	vpcmpd

#  define USE_WIDE_CHAR
# else
#  define SHIFT_REG	rdi
#  define kunpck_2x	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPCMP	vpcmpb
# endif
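
/* Note: kunpck_2x concatenates two CHAR_PER_VEC-bit kmasks into one
   2 * CHAR_PER_VEC-bit mask (first operand in the low half, second in
   the high half), so a single bsr can locate the last match across
   two consecutive VECs.  */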

# include "reg-macros.h"

# define VMATCH	VMM(0)
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096
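
/* With the 256-bit vectors from "x86-evex256-vecs.h", VEC_SIZE is 32,
   so CHAR_PER_VEC is 32 for strrchr and 8 for wcsrchr.  */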

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(STRRCHR, 6)
	movl	%edi, %eax
	/* Broadcast CHAR to VMATCH.  */
	VPBROADCAST %esi, %VMATCH

	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_boundary)
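	/* Example (VEC_SIZE == 32): a page offset of 4070 takes the
	   branch (4070 > 4096 - 32 == 4064) because an unaligned
	   32-byte load from there would spill into the next, possibly
	   unmapped, page; offsets <= 4064 are safe to read in full.  */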
L(page_cross_continue):
	VMOVU	(%rdi), %VMM(1)
	/* k0 has a 1 for each zero CHAR in VEC(1).  */
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRSI
	test	%VRSI, %VRSI
	jz	L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */
	/* K1 has a 1 for each search CHAR match in VEC(1).  */
	VPCMPEQ	%VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).
	 */
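	/* Example: if the zero-CHAR mask in VRSI is 0b00100000, blsmsk
	   yields 0b00111111 (all bits up to and including the lowest
	   set bit), so the `and` below keeps only matches at or before
	   the null terminator.  */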
	blsmsk	%VRSI, %VRSI
	and	%VRSI, %VRAX
	jz	L(ret0)
	/* Get last match (the `and` removed any out-of-bounds
	   matches).  */
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret0):
	ret

	/* Returns for first VEC x1/x2/x3 have hard-coded backward
	   search paths for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMPEQ	%VMATCH, %VMM(2), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* eax non-zero if search CHAR in range.  */
	and	%VRCX, %VRAX
	jnz	L(first_vec_x1_return)

	/* fallthrough: no match in VEC(2), so check for earlier
	   matches (in VEC(1)).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jz	L(ret1)
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rsi, %rax, CHAR_SIZE), %rax
# else
	addq	%rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMPEQ	%VMM(3), %VMATCH, %k3
	VPCMPEQ	%VMM(2), %VMATCH, %k2
	/* K2 and K3 have 1 for any search CHAR match.  Test if there
	   is any match in either of them.  Otherwise check VEC(1).  */
	KORTEST	%k2, %k3
	jz	L(first_vec_x0_test)

	/* Guaranteed that VEC(2) and VEC(3) are within range, so merge
	   the two bitmasks, then get the last result.  */
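	/* Example for wcsrchr (CHAR_PER_VEC == 8): if k2 is 0x02 and
	   k3 is 0x10, the merged mask is 0x1002 and bsr returns 12,
	   i.e. CHAR 4 of VEC(3), the last match in the 2x VEC
	   window.  */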
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 7
L(first_vec_x3):
	VPCMPEQ	%VMATCH, %VMM(4), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* If no search CHAR match in range, check
	   VEC(1)/VEC(2)/VEC(3).  */
	and	%VRCX, %VRAX
	jz	L(first_vec_x1_or_x2)
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMPEQ	%VMATCH, %VMM(2), %k1
	KMOV	%k1, %VRAX
	/* Check VEC(2) for last match first.  If no match, try
	   VEC(1).  */
	test	%VRAX, %VRAX
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%VMATCH, %VMM(3), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* Check VEC(3) for last match first.  If no match, try
	   VEC(2)/VEC(1).  */
	and	%VRCX, %VRAX
	jz	L(first_vec_x0_x1_test)
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 12
L(aligned_more):
	/* Need to keep original pointer in case VEC(1) has last
	   match.  */
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi

	VMOVU	VEC_SIZE(%rdi), %VMM(2)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x1)

	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	movq	%rdi, %r8
	test	%VRCX, %VRCX
	jnz	L(first_vec_x3)

	andq	$-(VEC_SIZE * 2), %rdi
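	/* rdi is now 2 * VEC_SIZE aligned, so the unrolled loop below
	   can use aligned loads and advance by 2 * VEC_SIZE per
	   iteration.  */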
	.p2align 4,, 10
L(first_aligned_loop):
	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
	   guarantee they don't store a match.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)

	VPCMPEQ	%VMM(5), %VMATCH, %k2
	vpxord	%VMM(6), %VMATCH, %VMM(7)

	VPMIN	%VMM(5), %VMM(6), %VMM(8)
	VPMIN	%VMM(8), %VMM(7), %VMM(7)
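	/* VMM(7) is zero at a CHAR position iff VEC(6) matches the
	   search CHAR there (the xor with VMATCH) or VEC(5)/VEC(6)
	   holds a null there (the min with VMM(8)), so the single
	   VPTESTN below detects both conditions at once; matches in
	   VEC(5) are tracked separately in k2.  */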

	VPTESTN	%VMM(7), %VMM(7), %k1
	subq	$(VEC_SIZE * -2), %rdi
	KORTEST	%k1, %k2
	jz	L(first_aligned_loop)

	VPCMPEQ	%VMM(6), %VMATCH, %k3
	VPTESTN	%VMM(8), %VMM(8), %k1

	/* If k1 is zero, then we found a CHAR match but no null-term.
	   We can now safely throw out VEC(1)-VEC(4).  */
	KTEST	%k1, %k1
	jz	L(second_aligned_loop_prep)

	KORTEST	%k2, %k3
	jnz	L(return_first_aligned_loop)


	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMPEQ	%VMM(4), %VMATCH, %k4
	KMOV	%k4, %VRAX
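	/* bsr sets ZF when its source is zero, so the jz below tests
	   for "no match in VEC(4)" without a separate test
	   instruction.  */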
	bsr	%VRAX, %VRAX
	jz	L(first_vec_x1_or_x2)
	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 8
L(return_first_aligned_loop):
	VPTESTN	%VMM(5), %VMM(5), %k0

	/* Combined results from VEC(5)/VEC(6).  */
	kunpck_2x %k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(first_vec_x1_or_x2_or_x3)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path, so to
	   speak.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
	movq	%rdi, %rsi
	/* Ideally we would save k2/k3 here, but `kmov/kunpck` take
	   uops on port 0 and have noticeable overhead in the loop.  */
	VMOVA	%VMM(5), %VMM(7)
	VMOVA	%VMM(6), %VMM(8)
	.p2align 4
L(second_aligned_loop):
	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
	VPCMPEQ	%VMM(5), %VMATCH, %k2
	vpxord	%VMM(6), %VMATCH, %VMM(3)

	VPMIN	%VMM(5), %VMM(6), %VMM(4)
	VPMIN	%VMM(3), %VMM(4), %VMM(3)

	VPTESTN	%VMM(3), %VMM(3), %k1
	subq	$(VEC_SIZE * -2), %rdi
	KORTEST	%k1, %k2
	jz	L(second_aligned_loop)
	VPCMPEQ	%VMM(6), %VMATCH, %k3
	VPTESTN	%VMM(4), %VMM(4), %k1
	KTEST	%k1, %k1
	jz	L(second_aligned_loop_set_furthest_match)

	/* Branch here because we know we have a match in VEC(7)/VEC(8)
	   but might not in VEC(5)/VEC(6), so the latter is expected to
	   be less likely.  */
	KORTEST	%k2, %k3
	jnz	L(return_new_match)

L(return_old_match):
	VPCMPEQ	%VMM(8), %VMATCH, %k0
	KMOV	%k0, %VRCX
	bsr	%VRCX, %VRCX
	jnz	L(return_old_match_ret)

	VPCMPEQ	%VMM(7), %VMATCH, %k0
	KMOV	%k0, %VRCX
	bsr	%VRCX, %VRCX
	subq	$VEC_SIZE, %rsi
L(return_old_match_ret):
	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(return_new_match):
	VPTESTN	%VMM(5), %VMM(5), %k0

	/* Combined results from VEC(5)/VEC(6).  */
	kunpck_2x %k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x

	/* Match at end was out-of-bounds so use last known match.  */
	and	%maskz_2x, %maskm_2x
	jz	L(return_old_match)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor
	   rdi, rax` sets the pointer with all page offset bits
	   cleared, so an offset of (PAGE_SIZE - VEC_SIZE) will get the
	   last aligned VEC before the page cross (guaranteed to be
	   safe to read).  Doing this as opposed to `movq %rdi, %rax;
	   andq $-VEC_SIZE, %rax` saves a bit of code size.  */
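	/* Example (VEC_SIZE == 32): if rdi is 0x7f1234567c89, eax
	   holds 0xc89 and `xorq %rdi, %rax` leaves 0x7f1234567000 in
	   rax, so the load below reads the last 32 bytes of rdi's
	   page.  */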
	xorq	%rdi, %rax
	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
	movl	%edi, %esi
	andl	$(VEC_SIZE - 1), %esi
	shrl	$2, %esi
# endif
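	/* For wcsrchr the kmask has one bit per 4-byte CHAR, so the
	   shift count must be the CHAR offset computed above (byte
	   offset / 4); for strrchr the byte offset in rdi can be used
	   directly since shrx only reads the low bits of the count.  */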
	shrx	%VGPR(SHIFT_REG), %VRCX, %VRCX

	test	%VRCX, %VRCX
	jz	L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMP	$0, %VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrx	%VGPR(SHIFT_REG), %VRAX, %VRAX

	/* Check if any search CHAR match in range.  */
	blsmsk	%VRCX, %VRCX
	and	%VRCX, %VRAX
	jz	L(ret3)
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret3):
	ret
END(STRRCHR)
#endif