/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	esi

#  define kunpck	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP	vpcmpd
# else
#  define SHIFT_REG	edi

#  define kunpck	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP	vpcmpb
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMMMATCH	ymm17
# define YMMSAVE	ymm18

# define YMM1	ymm19
# define YMM2	ymm20
# define YMM3	ymm21
# define YMM4	ymm22
# define YMM5	ymm23
# define YMM6	ymm24
# define YMM7	ymm25
# define YMM8	ymm26


# define VEC_SIZE	32
# define PAGE_SIZE	4096
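
/* Rough outline of the strategy (an illustrative sketch only; the code
   below additionally handles page crossing, alignment, and checking
   two vectors per loop iteration):

	last_match = none
	for each VEC of the string:
		if VEC contains a zero CHAR:
			return the last match at or before the zero CHAR,
			or last_match (possibly NULL) if there is none
		if VEC contains a match:
			last_match = last match in VEC  */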
	.section .text.evex, "ax", @progbits
ENTRY(STRRCHR)
	movl %edi, %eax
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

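	/* Check whether an unaligned VEC_SIZE load at rdi would cross a
	   page boundary.  If it would, take the cross-page path, which
	   instead loads the last aligned VEC of the page.  */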
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_boundary)

L(page_cross_continue):
	VMOVU (%rdi), %YMM1
	/* k0 has a 1 for each zero CHAR in YMM1.  */
	VPTESTN %YMM1, %YMM1, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jz L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in YMM1.  */
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).  */
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret0)
	/* Get last match (the `andl` removed any out of bounds
	   matches).  */
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret0):
	ret

	/* The returns for the first vec x1/x2/x3 have a hard-coded
	   backward search path for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMP $0, %YMMMATCH, %YMM2, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* eax non-zero if search CHAR in range.  */
	andl %ecx, %eax
	jnz L(first_vec_x1_return)

	/* fallthrough: no match in YMM2 so need to check for earlier
	   matches (in YMM1).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jz L(ret1)
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rsi, %rax, CHAR_SIZE), %rax
# else
	addq %rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMP $0, %YMM3, %YMMMATCH, %k3
	VPCMP $0, %YMM2, %YMMMATCH, %k2
	/* K2 and K3 have 1 for any search CHAR match.  Test if there
	   are any matches in either of them.  Otherwise check YMM1.  */
	kortestd %k2, %k3
	jz L(first_vec_x0_test)

	/* Guaranteed that YMM2 and YMM3 are within range so merge the
	   two bitmasks then get last result.  */
	kunpck %k2, %k3, %k3
	kmovq %k3, %rax
	bsrq %rax, %rax
	leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3):
	VPCMP $0, %YMMMATCH, %YMM4, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* If no search CHAR match in range, check YMM1/YMM2/YMM3.  */
	andl %ecx, %eax
	jz L(first_vec_x1_or_x2)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMP $0, %YMMMATCH, %YMM2, %k1
	kmovd %k1, %eax
	/* Check YMM2 for last match first.  If no match try YMM1.  */
	testl %eax, %eax
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl %eax, %eax
	leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x2):
	VPCMP $0, %YMMMATCH, %YMM3, %k1
	kmovd %k1, %eax
	blsmskl %ecx, %ecx
	/* Check YMM3 for last match first.  If no match try YMM2/YMM1.  */
	andl %ecx, %eax
	jz L(first_vec_x0_x1_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4
L(aligned_more):
	/* Need to keep original pointer in case YMM1 has last match.  */
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rdi
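	/* rdi is now aligned down to VEC_SIZE, so the loads below at
	   VEC_SIZE(%rdi)..(VEC_SIZE * 3)(%rdi) are at aligned addresses
	   (the first may partially overlap the vector already checked).  */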
	VMOVU VEC_SIZE(%rdi), %YMM2
	VPTESTN %YMM2, %YMM2, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x1)

	VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
	VPTESTN %YMM3, %YMM3, %k0
	kmovd %k0, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x2)

	VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
	VPTESTN %YMM4, %YMM4, %k0
	kmovd %k0, %ecx
	movq %rdi, %r8
	testl %ecx, %ecx
	jnz L(first_vec_x3)

	andq $-(VEC_SIZE * 2), %rdi
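	/* Align rdi down to 2 * VEC_SIZE so the loop below can use
	   aligned loads of two vectors per iteration.  */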
	.p2align 4
L(first_aligned_loop):
	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
	   they don't store a match.  */
	VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
	VMOVA (VEC_SIZE * 5)(%rdi), %YMM6

	VPCMP $0, %YMM5, %YMMMATCH, %k2
	vpxord %YMM6, %YMMMATCH, %YMM7

	VPMIN %YMM5, %YMM6, %YMM8
	VPMIN %YMM8, %YMM7, %YMM7
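	/* YMM7 now has a zero CHAR wherever YMM5 or YMM6 contains a
	   zero CHAR, or wherever YMM6 matches the search CHAR: the xor
	   above is zero on a match and an unsigned minimum is zero iff
	   one of its inputs is zero.  */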

	VPTESTN %YMM7, %YMM7, %k1
	subq $(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz L(first_aligned_loop)

	VPCMP $0, %YMM6, %YMMMATCH, %k3
	VPTESTN %YMM8, %YMM8, %k1
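	/* k1 has a 1 for each zero CHAR in YMM5 or YMM6.  If there is
	   none, only matches were found, so switch to the loop that
	   tracks the furthest match.  */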
	ktestd %k1, %k1
	jz L(second_aligned_loop_prep)

	kortestd %k2, %k3
	jnz L(return_first_aligned_loop)
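	/* fallthrough: a zero CHAR was found in YMM5/YMM6 but neither
	   holds a match, so the last match (if any) must be in the
	   first four vectors.  Search them backwards.  */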

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMP $0, %YMM4, %YMMMATCH, %k4
	kmovd %k4, %eax
	testl %eax, %eax
	jz L(first_vec_x1_or_x2)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
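	/* Merge the zero-CHAR and match masks for the YMM5/YMM6 pair
	   (which, after the pointer update in the loop, starts at
	   (VEC_SIZE * 2)(%rdi)), mask off matches past the first zero
	   CHAR, and return the last match that remains.  If none
	   remain, the match is in the first four vectors.  */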
	VPTESTN %YMM5, %YMM5, %k0
	kunpck %k0, %k1, %k0
	kmov_2x %k0, %maskz_2x

	blsmsk %maskz_2x, %maskz_2x
	kunpck %k2, %k3, %k3
	kmov_2x %k3, %maskm_2x
	and %maskz_2x, %maskm_2x
	jz L(first_vec_x1_or_x2_or_x3)

	bsr %maskm_2x, %maskm_2x
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path, so to
	   speak.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
	movq %rdi, %rsi
	kunpck %k2, %k3, %k4
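	/* rsi records the position such that the pair holding the
	   furthest match so far starts at (VEC_SIZE * 2)(%rsi); k4
	   holds that pair's combined match mask (used by
	   L(return_old_match)).  */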

	.p2align 4
L(second_aligned_loop):
	VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
	VMOVU (VEC_SIZE * 5)(%rdi), %YMM2

	VPCMP $0, %YMM1, %YMMMATCH, %k2
	vpxord %YMM2, %YMMMATCH, %YMM3

	VPMIN %YMM1, %YMM2, %YMM4
	VPMIN %YMM3, %YMM4, %YMM3
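	/* Same zero-CHAR/match detection trick as in
	   L(first_aligned_loop): YMM3 is zero wherever YMM1/YMM2 has a
	   zero CHAR or YMM2 matches the search CHAR.  */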

	VPTESTN %YMM3, %YMM3, %k1
	subq $(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz L(second_aligned_loop)

	VPCMP $0, %YMM2, %YMMMATCH, %k3
	VPTESTN %YMM4, %YMM4, %k1
	ktestd %k1, %k1
	jz L(second_aligned_loop_set_furthest_match)

	kortestd %k2, %k3
	/* Branch here because there is a significant advantage in terms
	   of the output dependency chain in using edx.  */
	jnz L(return_new_match)
L(return_old_match):
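	/* No valid new match; return the furthest match recorded in
	   k4/rsi.  */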
	kmovq %k4, %rax
	bsrq %rax, %rax
	leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
	ret

L(return_new_match):
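	/* Same mask merge as in L(return_first_aligned_loop), but for
	   the YMM1/YMM2 pair.  If every match is past the first zero
	   CHAR, fall back to the recorded old match.  */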
	VPTESTN %YMM1, %YMM1, %k0
	kunpck %k0, %k1, %k0
	kmov_2x %k0, %maskz_2x

	blsmsk %maskz_2x, %maskz_2x
	kunpck %k2, %k3, %k3
	kmov_2x %k3, %maskm_2x
	and %maskz_2x, %maskm_2x
	jz L(return_old_match)

	bsr %maskm_2x, %maskm_2x
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor
	   rdi, rax` sets the pointer with all page offset bits cleared,
	   so an offset of (PAGE_SIZE - VEC_SIZE) will get the last
	   aligned VEC before the page cross (guaranteed to be safe to
	   read).  Doing this as opposed to `movq %rdi, %rax; andq
	   $-VEC_SIZE, %rax` saves a bit of code size.  */
	xorq %rdi, %rax
	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
	VPTESTN %YMM1, %YMM1, %k0
	kmovd %k0, %ecx

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
	movl %edi, %esi
	andl $(VEC_SIZE - 1), %esi
	shrl $2, %esi
# endif
	shrxl %SHIFT_REG, %ecx, %ecx

	testl %ecx, %ecx
	jz L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMP $0, %YMMMATCH, %YMM1, %k1
	kmovd %k1, %eax
	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrxl %SHIFT_REG, %eax, %eax

	/* Check if any search CHAR match in range.  */
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret3)
	bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret3):
	ret

END(STRRCHR)
#endif