/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# include "reg-macros.h"

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_evex
# endif

# define PAGE_SIZE	4096
# define VMATCH	VMM(0)
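
/* memrchr (const void *s, int c, size_t n) returns a pointer to the
   last byte equal to (unsigned char) c in the first n bytes of s, or
   NULL if there is no such byte.  In the comments below: rdi = s
   (buf), esi = c (CHAR), rdx = n (len).  */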

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer.  Minus one for three reasons: 1) it is
	   necessary for a correct page cross check, 2) it sets up the
	   end pointer so that the lzcnt result can be subtracted from
	   it directly, and 3) it is a necessary step in aligning the
	   pointer.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)
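
	/* Rough C sketch of this first-VEC fast path (illustrative
	   only; byte_eq_mask () is just a name used in this sketch for
	   the vpcmpeqb/KMOV pair):

	     mask = byte_eq_mask (load (s + len - VEC_SIZE), c);
	     pos = lzcnt (mask);        // VEC_SIZE when mask == 0
	     if (len <= pos)
	       return NULL;             // no match within bounds
	     if (mask != 0)
	       return s + len - 1 - pos;
	     // else len > VEC_SIZE and no match yet: keep scanning.  */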

	/* Don't use rax for the pointer here because EVEX has better
	   encoding when offset % VEC_SIZE == 0.  */
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
	   already a dependency between rcx and rsi so no worries
	   about false-dep here.  */
	lzcnt	%VRCX, %VRSI
	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
	   match) but it was out of bounds or 2) rcx was zero and rdx
	   was <= VEC_SIZE so we are done scanning.  */
	cmpq	%rsi, %rdx
	/* NB: Use a branch to return zero/non-zero.  Common usage will
	   branch on the result of the function (if the return value is
	   null/non-null).  This branch can be used to predict the
	   ensuing one so there is no reason to extend the
	   data-dependency with cmovcc.  */
	jbe	L(zero_0)

	/* If rcx is zero then len must be > VEC_SIZE, otherwise since
	   we already tested len vs lzcnt(rcx) (in rsi) we are good to
	   return this match.  */
	test	%VRCX, %VRCX
	jz	L(more_1x_vec)
	subq	%rsi, %rax
	ret

	/* Fits in aligning bytes of first cache line for VEC_SIZE ==
	   32.  */
# if VEC_SIZE == 32
	.p2align 4,, 2
L(zero_0):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4,, 10
L(more_1x_vec):
	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax
L(page_cross_continue):
	/* Recompute length after aligning.  */
	subq	%rdi, %rax

	cmpq	$(VEC_SIZE * 2), %rax
	ja	L(more_2x_vec)

L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(ret_vec_x0_test)

	/* If VEC_SIZE == 64 we need to subtract because lzcntq won't
	   implicitly add VEC_SIZE to the match position.  */
# if VEC_SIZE == 64
	subl	$VEC_SIZE, %eax
# else
	cmpb	$VEC_SIZE, %al
# endif
	jle	L(zero_2)

	/* We adjusted rax (length) for VEC_SIZE == 64 so we need
	   separate offsets.  */
# if VEC_SIZE == 64
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
# else
	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
# endif
	KMOV	%k0, %VRCX
	/* NB: 64-bit lzcnt.  This will naturally add 32 to the match
	   position for VEC_SIZE == 32.  */
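	/* For VEC_SIZE == 32 the mask only occupies bits [31:0] of
	   rcx, so the 64-bit lzcnt below is at least 32; that extra 32
	   exactly accounts for this VEC starting one VEC_SIZE further
	   from the end.  */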
	lzcntq	%rcx, %rcx
	subl	%ecx, %eax
	ja	L(first_vec_x1_ret)
	/* If VEC_SIZE == 64 put L(zero_0) here as it can't fit in the
	   first cache line (this is the second cache line).  */
# if VEC_SIZE == 64
L(zero_0):
# endif
L(zero_2):
	xorl	%eax, %eax
	ret

	/* NB: Fits in the aligning bytes before the next cache line
	   for VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
	   L(ret_vec_x0_test).  */
# if VEC_SIZE == 32
L(first_vec_x1_ret):
	leaq	-1(%rdi, %rax), %rax
	ret
# endif

	.p2align 4,, 6
L(ret_vec_x0_test):
	lzcnt	%VRCX, %VRCX
	subl	%ecx, %eax
	jle	L(zero_2)
# if VEC_SIZE == 64
	/* Reuse the code at the end of L(ret_vec_x0_test) as we can't
	   fit L(first_vec_x1_ret) in the same cache line as its jmp
	   base, so we might as well save code size.  */
L(first_vec_x1_ret):
# endif
	leaq	-1(%rdi, %rax), %rax
	ret

	.p2align 4,, 6
L(loop_last_4x_vec):
	/* Compute remaining length.  */
	subl	%edi, %eax
L(last_4x_vec):
	cmpl	$(VEC_SIZE * 2), %eax
	jle	L(last_2x_vec)
# if VEC_SIZE == 32
	/* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
	   the spare bytes to align the loop properly.  */
	.p2align 4,, 10
# endif
L(more_2x_vec):

	/* Length > VEC_SIZE * 2, so check the first 2x VEC for a match
	   and return if either hits.  */
	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x0)

	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x1)

	/* This compare is needed no matter which path is taken next.  */
	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	/* Check if we are near the end.  */
	subq	$(VEC_SIZE * 4), %rax
	ja	L(more_4x_vec)

	test	%VRCX, %VRCX
	jnz	L(first_vec_x2_test)

	/* Adjust length for the final check and test whether we are at
	   the end.  */
	addl	$(VEC_SIZE * 1), %eax
	jle	L(zero_1)

	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	lzcnt	%VRCX, %VRCX
	subl	%ecx, %eax
	ja	L(first_vec_x3_ret)
L(zero_1):
	xorl	%eax, %eax
	ret
L(first_vec_x3_ret):
	leaq	-1(%rdi, %rax), %rax
	ret

	.p2align 4,, 6
L(first_vec_x2_test):
	/* Must adjust the length before the check: adding
	   (VEC_SIZE * 2 - 1) turns eax from (remaining length -
	   VEC_SIZE * 4) into the offset (from rdi) of the last byte of
	   the VEC just tested, so after subtracting lzcnt a negative
	   result means the match is before the start of the buffer.  */
	subl	$-(VEC_SIZE * 2 - 1), %eax
	lzcnt	%VRCX, %VRCX
	subl	%ecx, %eax
	jl	L(zero_4)
	addq	%rdi, %rax
	ret

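	/* For the L(first_vec_xN) returns below, bsr gives the offset
	   of the last matching byte within the VEC that was tested, so
	   the result is simply that VEC's base address plus the bsr
	   result.  */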
	.p2align 4,, 10
L(first_vec_x0):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax
	addq	%rcx, %rax
	ret

	/* Fits unobtrusively here.  */
L(zero_4):
	xorl	%eax, %eax
	ret

	.p2align 4,, 10
L(first_vec_x1):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax
	addq	%rcx, %rax
	ret

	.p2align 4,, 8
L(first_vec_x3):
	bsr	%VRCX, %VRCX
	addq	%rdi, %rax
	addq	%rcx, %rax
	ret

	.p2align 4,, 6
L(first_vec_x2):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
	addq	%rcx, %rax
	ret

	.p2align 4,, 2
L(more_4x_vec):
	test	%VRCX, %VRCX
	jnz	L(first_vec_x2)

	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
	KMOV	%k0, %VRCX

	test	%VRCX, %VRCX
	jnz	L(first_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	cmpq	$(VEC_SIZE * 4), %rax
	jbe	L(last_4x_vec)


	/* NB: We set up the loop so that it does NOT use
	   index-address-mode for the buffer.  This costs some
	   instructions & code size, but it avoids stalls caused by
	   unlaminated micro-fused instructions (as used in the loop)
	   being forced to issue in the same group (essentially
	   narrowing the backend width).  */

	/* Get the end pointer for the loop in rdx.  NB: we can't just
	   loop while rax > rdi because lengths that overflow can be
	   valid and would break the comparison.  */
# if VEC_SIZE == 64
	/* Use rdx as an intermediate to compute rax; this gets us imm8
	   encoding, which just allows the L(more_4x_vec) block to fit
	   in one cache line.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax

	/* No EVEX machine has partial register stalls.  This can be
	   replaced with `andq $(VEC_SIZE * -4), %rax/%rdx` if that
	   changes.  */
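	/* For VEC_SIZE == 64, clearing the low byte of rax/rdx below
	   is equivalent to aligning them down to VEC_SIZE * 4 == 256.
	   */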
	xorb	%al, %al
	xorb	%dl, %dl
# else
	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
	andq	$(VEC_SIZE * -4), %rax
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	andq	$(VEC_SIZE * -4), %rdx
# endif
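
	/* The loop below checks [rax - VEC_SIZE * 4, rax) each
	   iteration and steps rax down by VEC_SIZE * 4 until it
	   reaches rdx, leaving at most VEC_SIZE * 4 tail bytes above
	   rdi for L(loop_last_4x_vec).  */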

	.p2align 4
L(loop_4x_vec):
	/* NB: We could do the same optimization here as we do for
	   memchr/rawmemchr by using VEX encoding in the loop for
	   access to VEX vpcmpeqb + vpternlogd.  Since memrchr is not
	   as hot as memchr it may not be worth the extra code size,
	   but if the need arises it is an easy ~15% perf improvement
	   to the loop.  */
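
	/* What one loop iteration computes, as a rough C sketch
	   (illustrative only; eq_mask () / zero_byte_mask () are just
	   names used for this sketch):

	     k1 = ~eq_mask (v1, c);           // VEC at rax - 1 * VEC_SIZE
	     m  = min (v2 ^ bcast (c), v3 ^ bcast (c));
	     m[i] = 0 wherever k1 bit i is 0;  // i.e. wherever v1 matched
	     k2 = zero_byte_mask (m);         // match in v1, v2 or v3
	     k4 = eq_mask (v4, c);            // VEC at rax - 4 * VEC_SIZE
	     if ((k2 | k4) == 0)
	       continue;                      // no match in these 4 VECs
	   */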

	cmpq	%rdx, %rax
	je	L(loop_last_4x_vec)
	/* Store 1 where not-equal and 0 where equal in k1 (used to
	   mask later on).  */
	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1

	/* VEC(2/3) will have a zero byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
	   bit where CHAR is found and VEC(2/3) have a zero byte where
	   CHAR is found).  */
	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
	vptestnmb %VMM(3), %VMM(3), %k2

	addq	$-(VEC_SIZE * 4), %rax

	/* Any 1s and we found CHAR.  */
	KORTEST	%k2, %k4
	jz	L(loop_4x_vec)


	/* K1 has non-matches for the first VEC.  `inc` will overflow
	   rcx (wrap it to zero) iff all bytes were non-matches.  */
	KMOV	%k1, %VRCX
	inc	%VRCX
	jnz	L(first_vec_x0_end)

	vptestnmb %VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x1_end)
	KMOV	%k2, %VRCX

	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
	   VEC individually; for VEC_SIZE == 32 we combine them in a
	   single 64-bit GPR.  */
# if VEC_SIZE == 64
	test	%VRCX, %VRCX
	jnz	L(first_vec_x2_end)
	KMOV	%k4, %VRCX
# else
	/* Combine the last 2 VEC matches for VEC_SIZE == 32.  If rcx
	   (from VEC(3)) is zero (no CHAR in VEC(3)) then it won't
	   affect the result in rsi (from VEC(4)).  If rcx is non-zero
	   then CHAR is in VEC(3) and bsrq will use that position.  */
	KMOV	%k4, %VRSI
	salq	$32, %rcx
	orq	%rsi, %rcx
# endif
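	/* Either way, bit i of rcx now corresponds to the byte at
	   rax + i, so bsr yields the offset of the last match.  */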
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret

	.p2align 4,, 4
L(first_vec_x0_end):
	/* rcx has 1s at non-matches so we need to `not` it.  We used
	   `inc` to test if it was zero, so use `neg` to complete the
	   `not` so the last 1 bit represents a match.
	   NB: -(x + 1) == ~x.  */
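	/* Worked example (taking VEC_SIZE == 32 for concreteness): if
	   the only match is at byte 3 then rcx == 0xfffffff7; `inc`
	   gave 0xfffffff8 (non-zero, so we branched here), `neg` gives
	   0x00000008 == ~0xfffffff7, and `bsr` then picks bit 3.  */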
	neg	%VRCX
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 10
L(first_vec_x1_end):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

# if VEC_SIZE == 64
	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
	   need a return label for it.  */
	.p2align 4,, 4
L(first_vec_x2_end):
	bsr	%VRCX, %VRCX
	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
	ret
# endif


	.p2align 4,, 4
L(page_cross):
	/* Only bits [log2(VEC_SIZE) - 1:0] of the low byte of rax can
	   be set here (the page-cross test above verified that bits
	   [11:log2(VEC_SIZE)] are zero), so we can use movzbl to get
	   the offset within this VEC of the last byte we are checking.
	   */
	movzbl	%al, %ecx
	andq	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %VMATCH, %k0
	KMOV	%k0, %VRSI

	/* rax was computed as %rdi + %rdx - 1, so we need to add back
	   1 here to get the number of valid bytes in this VEC.  */
	leal	1(%rcx), %r8d

	/* Invert ecx to get the shift count that moves byte matches
	   beyond the end of the buffer out of the mask.  */
	notl	%ecx
	shlx	%VRCX, %VRSI, %VRSI
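
	/* Rough sketch of the masking above (illustrative only): with
	   ecx == offset of the last valid byte within this VEC, the
	   shift count is (~ecx % VEC_SIZE) == VEC_SIZE - 1 - ecx, so
	   match bits past the end are shifted out and the bit for the
	   last valid byte lands in the MSB; lzcnt below then again
	   yields the distance from the end of the buffer.  */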

	/* If rdx <= r8 then the entire [buf, buf + len] range is
	   handled in this page cross case.  NB: we can't use the trick
	   here that we use in the non-page-cross case because we
	   aren't checking a full VEC_SIZE.  */
	cmpq	%r8, %rdx
	ja	L(page_cross_check)
	lzcnt	%VRSI, %VRSI
	subl	%esi, %edx
	ja	L(page_cross_ret)
	xorl	%eax, %eax
	ret

L(page_cross_check):
	test	%VRSI, %VRSI
	jz	L(page_cross_continue)

	lzcnt	%VRSI, %VRSI
	subl	%esi, %edx
L(page_cross_ret):
	leaq	-1(%rdi, %rdx), %rax
	ret
END(MEMRCHR)
#endif