memrchr-evex.S source code [glibc/sysdeps/x86_64/multiarch/memrchr-evex.S]

1	/ memrchr optimized with 256-bit EVEX instructions.*
2	Copyright (C) 2021-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <isa-level.h>
20
21	#if ISA_SHOULD_BUILD (4)
22
23	# include <sysdep.h>
24	# include "evex256-vecs.h"
25	# if VEC_SIZE != 32
26	# error "VEC_SIZE != 32 unimplemented"
27	# endif
28
29	# ifndef MEMRCHR
30	# define MEMRCHR __memrchr_evex
31	# endif
32
33	# define PAGE_SIZE 4096
34	# define VECMATCH VEC(0)
35
36	.section SECTION(.text), "ax", @progbits
37	ENTRY_P2ALIGN(MEMRCHR, `6`)
38	# ifdef __ILP32__
39	/ Clear upper bits. /
40	and %RDX_LP, %RDX_LP
41	# else
42	test %RDX_LP, %RDX_LP
43	# endif
44	jz L(zero_0)
45
46	/ Get end pointer. Minus one for two reasons. 1) It is necessary for a*
47	correct page cross check and 2) it correctly sets up end ptr to be
48	subtract by lzcnt aligned. /*
49	leaq -`1`(%rdi, %rdx), %rax
50	vpbroadcastb %esi, %VECMATCH
51
52	/ Check if we can load 1x VEC without cross a page. /
53	testl $(PAGE_SIZE - VEC_SIZE), %eax
54	jz L(page_cross)
55
56	/ Don't use rax for pointer here because EVEX has better encoding with*
57	offset % VEC_SIZE == 0. /*
58	vpcmpb $`0`, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
59	kmovd %k0, %ecx
60
61	/ Fall through for rdx (len) <= VEC_SIZE (expect small sizes). /
62	cmpq $VEC_SIZE, %rdx
63	ja L(more_1x_vec)
64	L(ret_vec_x0_test):
65
66	/ If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which*
67	will guarantee edx (len) is less than it. /*
68	lzcntl %ecx, %ecx
69	cmpl %ecx, %edx
70	jle L(zero_0)
71	subq %rcx, %rax
72	ret
73
74	/ Fits in aligning bytes of first cache line. /
75	L(zero_0):
76	xorl %eax, %eax
77	ret
78
79	.p2align `4`,, `9`
80	L(ret_vec_x0_dec):
81	decq %rax
82	L(ret_vec_x0):
83	lzcntl %ecx, %ecx
84	subq %rcx, %rax
85	ret
86
87	.p2align `4`,, `10`
88	L(more_1x_vec):
89	testl %ecx, %ecx
90	jnz L(ret_vec_x0)
91
92	/ Align rax (pointer to string). /
93	andq $-VEC_SIZE, %rax
94
95	/ Recompute length after aligning. /
96	movq %rax, %rdx
97
98	/ Need no matter what. /
99	vpcmpb $`0`, -(VEC_SIZE)(%rax), %VECMATCH, %k0
100	kmovd %k0, %ecx
101
102	subq %rdi, %rdx
103
104	cmpq $(VEC_SIZE * `2`), %rdx
105	ja L(more_2x_vec)
106	L(last_2x_vec):
107
108	/ Must dec rax because L(ret_vec_x0_test) expects it. /
109	decq %rax
110	cmpl $VEC_SIZE, %edx
111	jbe L(ret_vec_x0_test)
112
113	testl %ecx, %ecx
114	jnz L(ret_vec_x0)
115
116	/ Don't use rax for pointer here because EVEX has better encoding with*
117	offset % VEC_SIZE == 0. /*
118	vpcmpb $`0`, -(VEC_SIZE * `2`)(%rdi, %rdx), %VECMATCH, %k0
119	kmovd %k0, %ecx
120	/ NB: 64-bit lzcnt. This will naturally add 32 to position. /
121	lzcntq %rcx, %rcx
122	cmpl %ecx, %edx
123	jle L(zero_0)
124	subq %rcx, %rax
125	ret
126
127	/ Inexpensive place to put this regarding code size / target alignments*
128	/ ICache NLP. Necessary for 2-byte encoding of jump to page cross
129	case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
130	in first cache line. /*
131	L(page_cross):
132	movq %rax, %rsi
133	andq $-VEC_SIZE, %rsi
134	vpcmpb $`0`, (%rsi), %VECMATCH, %k0
135	kmovd %k0, %r8d
136	/ Shift out negative alignment (because we are starting from endptr and*
137	working backwards). /*
138	movl %eax, %ecx
139	/ notl because eax already has endptr - 1. (-x = ~(x - 1)). /
140	notl %ecx
141	shlxl %ecx, %r8d, %ecx
142	cmpq %rdi, %rsi
143	ja L(more_1x_vec)
144	lzcntl %ecx, %ecx
145	cmpl %ecx, %edx
146	jle L(zero_1)
147	subq %rcx, %rax
148	ret
149
150	/ Continue creating zero labels that fit in aligning bytes and get*
151	2-byte encoding / are in the same cache line as condition. /*
152	L(zero_1):
153	xorl %eax, %eax
154	ret
155
156	.p2align `4`,, `8`
157	L(ret_vec_x1):
158	/ This will naturally add 32 to position. /
159	bsrl %ecx, %ecx
160	leaq -(VEC_SIZE * `2`)(%rcx, %rax), %rax
161	ret
162
163	.p2align `4`,, `8`
164	L(more_2x_vec):
165	testl %ecx, %ecx
166	jnz L(ret_vec_x0_dec)
167
168	vpcmpb $`0`, -(VEC_SIZE * `2`)(%rax), %VECMATCH, %k0
169	kmovd %k0, %ecx
170	testl %ecx, %ecx
171	jnz L(ret_vec_x1)
172
173	/ Need no matter what. /
174	vpcmpb $`0`, -(VEC_SIZE * `3`)(%rax), %VECMATCH, %k0
175	kmovd %k0, %ecx
176
177	subq $(VEC_SIZE * `4`), %rdx
178	ja L(more_4x_vec)
179
180	cmpl $(VEC_SIZE * -`1`), %edx
181	jle L(ret_vec_x2_test)
182	L(last_vec):
183	testl %ecx, %ecx
184	jnz L(ret_vec_x2)
185
186
187	/ Need no matter what. /
188	vpcmpb $`0`, -(VEC_SIZE * `4`)(%rax), %VECMATCH, %k0
189	kmovd %k0, %ecx
190	lzcntl %ecx, %ecx
191	subq $(VEC_SIZE * `3` + `1`), %rax
192	subq %rcx, %rax
193	cmpq %rax, %rdi
194	ja L(zero_1)
195	ret
196
197	.p2align `4`,, `8`
198	L(ret_vec_x2_test):
199	lzcntl %ecx, %ecx
200	subq $(VEC_SIZE * `2` + `1`), %rax
201	subq %rcx, %rax
202	cmpq %rax, %rdi
203	ja L(zero_1)
204	ret
205
206	.p2align `4`,, `8`
207	L(ret_vec_x2):
208	bsrl %ecx, %ecx
209	leaq -(VEC_SIZE * `3`)(%rcx, %rax), %rax
210	ret
211
212	.p2align `4`,, `8`
213	L(ret_vec_x3):
214	bsrl %ecx, %ecx
215	leaq -(VEC_SIZE * `4`)(%rcx, %rax), %rax
216	ret
217
218	.p2align `4`,, `8`
219	L(more_4x_vec):
220	testl %ecx, %ecx
221	jnz L(ret_vec_x2)
222
223	vpcmpb $`0`, -(VEC_SIZE * `4`)(%rax), %VECMATCH, %k0
224	kmovd %k0, %ecx
225
226	testl %ecx, %ecx
227	jnz L(ret_vec_x3)
228
229	/ Check if near end before re-aligning (otherwise might do an*
230	unnecessary loop iteration). /*
231	addq $-(VEC_SIZE * `4`), %rax
232	cmpq $(VEC_SIZE * `4`), %rdx
233	jbe L(last_4x_vec)
234
235	decq %rax
236	andq $-(VEC_SIZE * `4`), %rax
237	movq %rdi, %rdx
238	/ Get endptr for loop in rdx. NB: Can't just do while rax > rdi because*
239	lengths that overflow can be valid and break the comparison. /*
240	andq $-(VEC_SIZE * `4`), %rdx
241
242	.p2align `4`
243	L(loop_4x_vec):
244	/ Store 1 were not-equals and 0 where equals in k1 (used to mask later*
245	on). /*
246	vpcmpb $`4`, (VEC_SIZE * `3`)(%rax), %VECMATCH, %k1
247
248	/ VEC(2/3) will have zero-byte where we found a CHAR. /
249	vpxorq (VEC_SIZE * `2`)(%rax), %VECMATCH, %VEC(`2`)
250	vpxorq (VEC_SIZE * `1`)(%rax), %VECMATCH, %VEC(`3`)
251	vpcmpb $`0`, (VEC_SIZE * `0`)(%rax), %VECMATCH, %k4
252
253	/ Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where*
254	CHAR is found and VEC(2/3) have zero-byte where CHAR is found. /*
255	vpminub %VEC(`2`), %VEC(`3`), %VEC(`3`){%k1}{z}
256	vptestnmb %VEC(`3`), %VEC(`3`), %k2
257
258	/ Any 1s and we found CHAR. /
259	kortestd %k2, %k4
260	jnz L(loop_end)
261
262	addq $-(VEC_SIZE * `4`), %rax
263	cmpq %rdx, %rax
264	jne L(loop_4x_vec)
265
266	/ Need to re-adjust rdx / rax for L(last_4x_vec). /
267	subq $-(VEC_SIZE * `4`), %rdx
268	movq %rdx, %rax
269	subl %edi, %edx
270	L(last_4x_vec):
271
272	/ Used no matter what. /
273	vpcmpb $`0`, (VEC_SIZE * -`1`)(%rax), %VECMATCH, %k0
274	kmovd %k0, %ecx
275
276	cmpl $(VEC_SIZE * `2`), %edx
277	jbe L(last_2x_vec)
278
279	testl %ecx, %ecx
280	jnz L(ret_vec_x0_dec)
281
282
283	vpcmpb $`0`, (VEC_SIZE * -`2`)(%rax), %VECMATCH, %k0
284	kmovd %k0, %ecx
285
286	testl %ecx, %ecx
287	jnz L(ret_vec_x1)
288
289	/ Used no matter what. /
290	vpcmpb $`0`, (VEC_SIZE * -`3`)(%rax), %VECMATCH, %k0
291	kmovd %k0, %ecx
292
293	cmpl $(VEC_SIZE * `3`), %edx
294	ja L(last_vec)
295
296	lzcntl %ecx, %ecx
297	subq $(VEC_SIZE * `2` + `1`), %rax
298	subq %rcx, %rax
299	cmpq %rax, %rdi
300	jbe L(ret_1)
301	xorl %eax, %eax
302	L(ret_1):
303	ret
304
305	.p2align `4`,, `6`
306	L(loop_end):
307	kmovd %k1, %ecx
308	notl %ecx
309	testl %ecx, %ecx
310	jnz L(ret_vec_x0_end)
311
312	vptestnmb %VEC(`2`), %VEC(`2`), %k0
313	kmovd %k0, %ecx
314	testl %ecx, %ecx
315	jnz L(ret_vec_x1_end)
316
317	kmovd %k2, %ecx
318	kmovd %k4, %esi
319	/ Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)*
320	then it won't affect the result in esi (VEC4). If ecx is non-zero
321	then CHAR in VEC3 and bsrq will use that position. /*
322	salq $`32`, %rcx
323	orq %rsi, %rcx
324	bsrq %rcx, %rcx
325	addq %rcx, %rax
326	ret
327	.p2align `4`,, `4`
328	L(ret_vec_x0_end):
329	addq $(VEC_SIZE), %rax
330	L(ret_vec_x1_end):
331	bsrl %ecx, %ecx
332	leaq (VEC_SIZE * `2`)(%rax, %rcx), %rax
333	ret
334
335	END(MEMRCHR)
336	#endif
337

Browse the source code of glibc/sysdeps/x86_64/multiarch/memrchr-evex.S