/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define VMOVA		vmovdqa64

# define YMMMATCH	ymm16

# define VEC_SIZE 32

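/* memrchr (s, c, n): return a pointer to the last byte equal to C in
   the first N bytes of S, or NULL.  Strategy: check the last VEC_SIZE
   bytes first, then align %rdi and walk backwards 4 * VEC_SIZE bytes
   per iteration, combining the four compare masks (k1-k6) so a single
   kortestd detects a match.  Short buffers and tails use VEC_SIZE
   aligned loads, so no access crosses a page boundary; bytes outside
   the buffer are masked off or rejected by a length check.  */
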
	.section .text.evex,"ax",@progbits
ENTRY (__memrchr_evex)
	/* Broadcast CHAR to YMMMATCH.  */
	vpbroadcastb %esi, %YMMMATCH

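	/* Lengths of at most VEC_SIZE take the short path below;
	   otherwise make %rdi point at the last VEC_SIZE bytes of the
	   buffer (%rdi += length - VEC_SIZE).  */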
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

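	/* No match in the last VEC_SIZE bytes.  Move the 4 * VEC_SIZE
	   window back and check whether %rdi is already aligned to
	   VEC_SIZE, skipping the alignment fixup if so.  */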
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time backward.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

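	/* Compare all four VECs against YMMMATCH and OR the resulting
	   masks pairwise so a single kortestd can test all of them for
	   a match.  */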
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
	kord	%k1, %k2, %k5
	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

	kord	%k3, %k4, %k6
	kortestd %k5, %k6
	jz	L(loop_4x_vec)

	/* There is a match.  */
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	kmovd	%k1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_4x_vec_or_less):
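	/* At most 4 * VEC_SIZE bytes of the buffer remain.  After the
	   add below, %edx is the number of buffer bytes counted down
	   from the top of the 4 * VEC_SIZE window at %rdi; matches
	   below that limit lie before the start of the buffer and are
	   ignored.  */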
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_2x_vec):
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

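	/* Return paths for a match found in VEC 0/1/2/3 of the window
	   at %rdi: bsrl picks the highest set bit of the compare mask,
	   i.e. the match closest to the end of the buffer within that
	   VEC.  */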
	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

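	/* Same as above, but additionally check against the remaining
	   length in %rdx that the match does not fall before the start
	   of the buffer.  */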
	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
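	/* At most VEC_SIZE bytes (%edx) remain and %rdi is VEC_SIZE
	   aligned: compare one VEC and keep only the low %edx bits of
	   the mask.  */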
	movl	%edx, %ecx

	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	/* Support rdx << 32.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_or_less):
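	/* The whole buffer is at most VEC_SIZE bytes: %rdx holds
	   length - VEC_SIZE and %rdi still points to the start of the
	   buffer.  Use VEC_SIZE aligned loads so no access can cross a
	   page boundary, and mask off the bytes that lie outside the
	   buffer.  */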
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(zero)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret

	.p2align 4
L(last_vec_2x_aligned):
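	/* The buffer spans two aligned VECs.  %esi is the number of
	   buffer bytes that fall into the higher VEC: check it first
	   with the trailing bytes masked off, then the lower VEC with
	   the leading bytes shifted off.  */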
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	%r8d, %ecx

	kmovd	%k1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret
END (__memrchr_evex)
#endif