/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_evex
# endif

# define PAGE_SIZE	4096
# define VECMATCH	VEC(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer.  Minus one for two reasons: 1) it is necessary for
	   a correct page cross check and 2) it sets up the end pointer so the
	   result can be formed by simply subtracting the lzcnt count.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb	%esi, %VECMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)
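	/* The check above takes the page cross path whenever
	   (end - 1) % PAGE_SIZE < VEC_SIZE, i.e. whenever the VEC_SIZE byte
	   load ending at the last byte could touch the preceding (possibly
	   unmapped) page.  */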

	/* Don't use rax for the pointer here because EVEX has a better
	   encoding when the offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (small sizes are expected).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches), lzcnt sets it to 32 (VEC_SIZE), which
	   guarantees edx (len) is no greater than it, so the branch below
	   falls into L(zero_0).  */
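	/* Bit i of ecx corresponds to the byte at end - VEC_SIZE + i and rax
	   is end - 1, so rax - lzcnt is the address of the last match.  E.g.
	   a last match at byte 29 of the VEC gives lzcnt == 2 and end - 3.
	   Matches in bytes below the string start (possible when len <
	   VEC_SIZE) give lzcnt >= edx and also fall into L(zero_0).  */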
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret
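	/* L(ret_vec_x0) assumes bit 31 of ecx corresponds to the byte at rax,
	   so rax - lzcnt is the last match; callers whose ecx covers the VEC
	   ending at rax - 1 enter through L(ret_vec_x0_dec) instead.  */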

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (the end pointer) down to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Needed no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx
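	/* rdx is now the distance from the string start to the aligned end
	   pointer, i.e. the number of bytes left to search.  */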

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for the pointer here because EVEX has a better
	   encoding when the offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt.  This will naturally add 32 to the position.  */
	lzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Inexpensive place to put this regarding code size / target
	   alignments / ICache NLP.  Necessary for the 2-byte encoding of the
	   jump to the page cross case, which in turn is necessary for the hot
	   path (len <= VEC_SIZE) to fit in the first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out the negative alignment (because we are starting from the
	   end pointer and working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
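	/* The shift count is (-end) % VEC_SIZE == (VEC_SIZE - 1) -
	   ((end - 1) % VEC_SIZE), which moves the bit for the byte at end - 1
	   up to bit 31 and shifts out bits for bytes at or above end, so the
	   lzcnt based returns work unchanged.  */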
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
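	/* The branch above continues the search when the aligned VEC starts
	   above the string start; L(more_1x_vec) re-tests ecx first, so a
	   match already found in the shifted mask is still honored.  */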
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero labels that fit in the aligning bytes and get
	   a 2-byte encoding / are in the same cache line as the branch that
	   uses them.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* ecx holds the matches for the VEC at -(VEC_SIZE * 2)(%rax); bsr
	   gives the byte index within that VEC.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Needed no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)
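	/* rdx is now (aligned end) - (string start) - VEC_SIZE * 4, which is
	   at most zero here.  edx <= -VEC_SIZE means the VEC at
	   -(VEC_SIZE * 3)(%rax) already reaches at or below the string start,
	   so it is the last one that can hold a valid match.  */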

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	/* Needed no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near the end before re-aligning (otherwise we might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get the end pointer for the loop in rdx.  NB: Can't just do while
	   rax > rdi because lengths that overflow can be valid and break the
	   comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx
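	/* rdx is the string start rounded down to VEC_SIZE * 4 and is the loop
	   terminator: the loop below walks backwards one VEC_SIZE * 4 block at
	   a time and stops once rax reaches rdx, leaving the partial block
	   containing the string start to L(last_4x_vec).  Its first iteration
	   may re-check bytes already examined above, which is harmless since
	   they held no CHAR.  */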

	.p2align 4
L(loop_4x_vec):
	/* Store 1 where not-equal and 0 where equal in k1 (used to mask later
	   on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have a zero byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero bit where
	   CHAR is found and VEC(2/3) have a zero byte where CHAR is found).  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb	%VEC(3), %VEC(3), %k2
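	/* k2 is non-zero iff CHAR was found in one of the three VECs at
	   offsets VEC_SIZE * 1..3 from rax; k4 covers the VEC at offset 0.  */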

	/* Any 1s and we found CHAR.  */
	kortestd	%k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
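	/* rax is now the lowest address the loop checked and edx == rax - rdi
	   the number of remaining bytes (at most VEC_SIZE * 4), matching what
	   L(last_4x_vec) expects.  */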
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)
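	/* No match in the VEC at (VEC_SIZE * 3)(%rax).  VEC(2) holds the xor
	   of the VEC at (VEC_SIZE * 2)(%rax) with VECMATCH, so a zero byte
	   there marks a CHAR.  */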

	vptestnmb	%VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine the last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in
	   VEC3) then it won't affect the result in esi (VEC4).  If ecx is
	   non-zero then CHAR is in VEC3 and bsrq will use that position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
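	/* L(ret_vec_x1_end) is entered with ecx holding the zero-byte mask of
	   VEC(2), i.e. the matches in the VEC at (VEC_SIZE * 2)(%rax);
	   L(ret_vec_x0_end) first adds VEC_SIZE so the shared lea resolves
	   into the VEC at (VEC_SIZE * 3)(%rax).  */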
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif