/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# include "x86-evex256-vecs.h"

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	rsi
#  define kunpck_2x	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPCMP	vpcmpd

#  define USE_WIDE_CHAR
# else
#  define SHIFT_REG	rdi
#  define kunpck_2x	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPCMP	vpcmpb
# endif

# include "reg-macros.h"

# define VMATCH	VMM(0)
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(STRRCHR, 6)
	movl	%edi, %eax
	/* Broadcast CHAR to VMATCH.  */
	VPBROADCAST %esi, %VMATCH

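	/* If the page offset of src is greater than PAGE_SIZE - VEC_SIZE,
	   a full VEC load from src would cross a page boundary, so take
	   the page-cross path.  */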
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_boundary)
L(page_cross_continue):
	VMOVU	(%rdi), %VMM(1)
	/* k0 has a 1 for each zero CHAR in VEC(1).  */
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRSI
	test	%VRSI, %VRSI
	jz	L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */
	/* K1 has a 1 for each search CHAR match in VEC(1).  */
	VPCMPEQ	%VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	/* Build a mask up to and including the first zero CHAR (used
	   to mask off potential search CHAR matches past the end of
	   the string).  */
	blsmsk	%VRSI, %VRSI
	and	%VRSI, %VRAX
	jz	L(ret0)
	/* Get the last match (the `and` removed any out-of-bounds
	   matches).  */
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret0):
	ret

	/* The return paths for the first VECs x1/x2/x3 have hard-coded
	   backward search paths for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMPEQ	%VMATCH, %VMM(2), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* eax is non-zero if a search CHAR match is in range.  */
	and	%VRCX, %VRAX
	jnz	L(first_vec_x1_return)

	/* fallthrough: no match in VEC(2), so we need to check for
	   earlier matches (in VEC(1)).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jz	L(ret1)
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rsi, %rax, CHAR_SIZE), %rax
# else
	addq	%rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMPEQ	%VMM(3), %VMATCH, %k3
	VPCMPEQ	%VMM(2), %VMATCH, %k2
	/* K2 and K3 have 1 for any search CHAR match.  Test if there
	   are any matches in either of them; otherwise check VEC(1).  */
	KORTEST	%k2, %k3
	jz	L(first_vec_x0_test)

	/* VEC(2) and VEC(3) are guaranteed to be within range, so merge
	   the two bitmasks and then take the last match.  */
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 7
L(first_vec_x3):
	VPCMPEQ	%VMATCH, %VMM(4), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* If no search CHAR match is in range, check
	   VEC(1)/VEC(2)/VEC(3).  */
	and	%VRCX, %VRAX
	jz	L(first_vec_x1_or_x2)
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMPEQ	%VMATCH, %VMM(2), %k1
	KMOV	%k1, %VRAX
	/* Check VEC(2) for the last match first.  If there is no match,
	   try VEC(1).  */
	test	%VRAX, %VRAX
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%VMATCH, %VMM(3), %k1
	KMOV	%k1, %VRAX
	blsmsk	%VRCX, %VRCX
	/* Check VEC(3) for the last match first.  If there is no match,
	   try VEC(2)/VEC(1).  */
	and	%VRCX, %VRAX
	jz	L(first_vec_x0_x1_test)
	bsr	%VRAX, %VRAX
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 12
L(aligned_more):
	/* Need to keep the original pointer in case VEC(1) has the
	   last match.  */
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi

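	/* VEC(1) was already checked above, so scan VEC(2), VEC(3), and
	   VEC(4) from the VEC_SIZE-aligned pointer before entering the
	   main loop.  */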
	VMOVU	VEC_SIZE(%rdi), %VMM(2)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x1)

	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
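	/* Save the VEC_SIZE-aligned pointer; the VEC(2)-VEC(4) return
	   paths can be reached from the loop below, after rdi has been
	   re-aligned and advanced.  */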
	movq	%rdi, %r8
	test	%VRCX, %VRCX
	jnz	L(first_vec_x3)

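	/* Align rdi down to a 2 * VEC_SIZE boundary for the 2x unrolled
	   loop; the aligned VMOVA loads below then start at or before
	   the end of the region already checked.  */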
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4,, 10
L(first_aligned_loop):
	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
	   guarantee they don't store a match.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)

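	/* k2 has the search CHAR matches from VEC(5).  For VEC(6),
	   xor-ing with VMATCH zeroes the matching lanes, so after
	   folding in the unsigned minimum of VEC(5)/VEC(6) a single
	   VPTESTN/KORTEST detects either a zero CHAR in VEC(5)/VEC(6)
	   or a match in VEC(6).  */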
	VPCMPEQ	%VMM(5), %VMATCH, %k2
	vpxord	%VMM(6), %VMATCH, %VMM(7)

	VPMIN	%VMM(5), %VMM(6), %VMM(8)
	VPMIN	%VMM(8), %VMM(7), %VMM(7)

	VPTESTN	%VMM(7), %VMM(7), %k1
	subq	$(VEC_SIZE * -2), %rdi
	KORTEST	%k1, %k2
	jz	L(first_aligned_loop)

	VPCMPEQ	%VMM(6), %VMATCH, %k3
	VPTESTN	%VMM(8), %VMM(8), %k1

	/* If k1 is zero, then we found a CHAR match but no null-term.
	   We can now safely throw out VEC1-4.  */
	KTEST	%k1, %k1
	jz	L(second_aligned_loop_prep)

	KORTEST	%k2, %k3
	jnz	L(return_first_aligned_loop)


	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
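	/* Check VEC(4) for the last match.  bsr sets ZF if there is
	   none, in which case fall back to VEC(2)/VEC(3) and then
	   VEC(1).  */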
	VPCMPEQ	%VMM(4), %VMATCH, %k4
	KMOV	%k4, %VRAX
	bsr	%VRAX, %VRAX
	jz	L(first_vec_x1_or_x2)
	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4,, 8
L(return_first_aligned_loop):
	VPTESTN	%VMM(5), %VMM(5), %k0

	/* Combined results from VEC5/6.  */
	kunpck_2x %k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(first_vec_x1_or_x2_or_x3)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path, so to
	   speak.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
	movq	%rdi, %rsi
	/* Ideally we would save k2/k3, but `kmov/kunpck` take uops on
	   port 0 and have noticeable overhead in the loop.  */
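	/* VEC(5)/VEC(6) contain a match and no zero CHAR here, so keep
	   them (along with rdi, saved in rsi above) as the furthest
	   match found so far.  */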
	VMOVA	%VMM(5), %VMM(7)
	VMOVA	%VMM(6), %VMM(8)
	.p2align 4
L(second_aligned_loop):
	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
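	/* Same combined zero CHAR/match detection as in the first
	   aligned loop, with VMM(3)/VMM(4) as scratch.  */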
	VPCMPEQ	%VMM(5), %VMATCH, %k2
	vpxord	%VMM(6), %VMATCH, %VMM(3)

	VPMIN	%VMM(5), %VMM(6), %VMM(4)
	VPMIN	%VMM(3), %VMM(4), %VMM(3)

	VPTESTN	%VMM(3), %VMM(3), %k1
	subq	$(VEC_SIZE * -2), %rdi
	KORTEST	%k1, %k2
	jz	L(second_aligned_loop)
	VPCMPEQ	%VMM(6), %VMATCH, %k3
	VPTESTN	%VMM(4), %VMM(4), %k1
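	/* k1 has the zero CHARs from VEC(5)/VEC(6).  If there is none,
	   record the new furthest match and keep looping.  */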
	KTEST	%k1, %k1
	jz	L(second_aligned_loop_set_furthest_match)

	/* Branch here because we know there is a match in VEC(7)/VEC(8)
	   but there might not be one in VEC(5)/VEC(6), so the latter
	   case is expected to be less likely.  */
	KORTEST	%k2, %k3
	jnz	L(return_new_match)

L(return_old_match):
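	/* No usable match in the newest VECs, so return the furthest
	   match saved in VMM(7)/VMM(8).  Check VMM(8) (the later VEC)
	   first.  */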
	VPCMPEQ	%VMM(8), %VMATCH, %k0
	KMOV	%k0, %VRCX
	bsr	%VRCX, %VRCX
	jnz	L(return_old_match_ret)

	VPCMPEQ	%VMM(7), %VMATCH, %k0
	KMOV	%k0, %VRCX
	bsr	%VRCX, %VRCX
	subq	$VEC_SIZE, %rsi
L(return_old_match_ret):
	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(return_new_match):
	VPTESTN	%VMM(5), %VMM(5), %k0

	/* Combined results from VEC5/6.  */
	kunpck_2x %k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck_2x %k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x

	/* If the match at the end was out-of-bounds (past the zero
	   CHAR), use the last known match.  */
	and	%maskz_2x, %maskm_2x
	jz	L(return_old_match)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor
	   rdi, rax` gives a pointer with all page offset bits cleared,
	   so loading from offset (PAGE_SIZE - VEC_SIZE) gets the last
	   aligned VEC before the page cross (guaranteed to be safe to
	   read).  Doing this as opposed to `movq %rdi, %rax; andq
	   $-VEC_SIZE, %rax` saves a bit of code size.  */
	xorq	%rdi, %rax
	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
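	/* The mask has one bit per CHAR (4 bytes for wcsrchr), so the
	   shift count is the byte offset within the VEC divided by
	   CHAR_SIZE.  */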
	movl	%edi, %esi
	andl	$(VEC_SIZE - 1), %esi
	shrl	$2, %esi
# endif
	shrx	%VGPR(SHIFT_REG), %VRCX, %VRCX

	test	%VRCX, %VRCX
	jz	L(page_cross_continue)

	/* Found a zero CHAR, so we need to test for the search CHAR.  */
	VPCMP	$0, %VMATCH, %VMM(1), %k1
	KMOV	%k1, %VRAX
	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrx	%VGPR(SHIFT_REG), %VRAX, %VRAX

	/* Check if any search CHAR match is in range.  */
	blsmsk	%VRCX, %VRCX
	and	%VRCX, %VRAX
	jz	L(ret3)
	bsr	%VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret3):
	ret
END(STRRCHR)
#endif