1 | /* Copyright (C) 2011-2017 Free Software Foundation, Inc. |
2 | Contributed by Intel Corporation. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
22 | |
	.text

/* void *memchr (const void *s, int c, size_t n)

   ABI: System V AMD64.
   In:   rdi = s, rsi = c (only the low byte is used), rdx = n.
   Out:  rax = pointer to the first byte in s[0..n-1] equal to
	 (unsigned char) c, or NULL if there is none.
   Clobbers: rcx, rdx, xmm0-xmm4, flags.

   Strategy: broadcast c to all 16 bytes of xmm1, handle the first
   (possibly unaligned) 16 bytes, then scan 16-byte chunks until the
   pointer is 64-byte aligned, then run a 64-bytes-per-iteration loop
   that merges four pcmpeqb results with pmaxub so a single pmovmskb
   tests all 64 bytes at once.  */
ENTRY(memchr)
	movd	%rsi, %xmm1		/* xmm1 byte 0 = c (low byte of rsi).  */
	mov	%rdi, %rcx		/* rcx = s, for alignment arithmetic.  */

	punpcklbw %xmm1, %xmm1		/* Duplicate c into the low 2 bytes...  */
	test	%rdx, %rdx
	jz	L(return_null)		/* n == 0 -> NULL.  */
	punpcklbw %xmm1, %xmm1		/* ... and now into the low 4 bytes.  */

	and	$63, %rcx		/* rcx = s % 64 (offset in 64-byte block).  */
	pshufd	$0, %xmm1, %xmm1	/* Broadcast c to all 16 bytes of xmm1.  */

	cmp	$48, %rcx		/* If s % 64 > 48, an unaligned 16-byte load
					   would cross the 64-byte boundary (and
					   possibly a page); take the careful path.  */
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0		/* Unaligned load of the first 16 bytes.  */
	pcmpeqb	%xmm1, %xmm0		/* Per-byte compare: 0xff where byte == c.  */
	pmovmskb %xmm0, %eax		/* eax = 16-bit mask of matching bytes.  */
	test	%eax, %eax

	jnz	L(matches_1)		/* Possible match; still must check n.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* n <= 16 and no match within it.  */
	add	$16, %rdi
	and	$15, %rcx		/* rcx = s % 16.  */
	and	$-16, %rdi		/* Round rdi down to a 16-byte boundary;
					   some low bytes will be re-scanned.  */
	add	%rcx, %rdx		/* Compensate n for the re-scanned bytes.  */
	sub	$64, %rdx		/* rdx = remaining - 64 from here on.  */
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* First 16 bytes straddle a 64-byte boundary: use an aligned load
	   of the enclosing 16-byte chunk and mask off bytes before s.  */
	and	$15, %rcx		/* rcx = s % 16 (bytes to ignore).  */
	and	$-16, %rdi		/* Align rdi down to 16 bytes.  */
	movdqa	(%rdi), %xmm0		/* Aligned load, includes bytes before s.  */

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax		/* Shift out mask bits for bytes before s.  */
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax		/* eax = match index relative to s.  */

	sub	%rax, %rdx
	jbe	L(return_null)		/* Match lies at or beyond s + n.  */
	add	%rdi, %rax
	add	%rcx, %rax		/* rax = aligned base + (s%16) + index.  */
	ret

	.p2align 4
L(unaligned_no_match):
	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   rdx = rcx + rdx
	   rdx |= -(rdx < rcx) */
	add	%rcx, %rdx
	sbb	%rax, %rax		/* rax = all-ones iff the add carried.  */
	or	%rax, %rdx		/* Saturate the count on overflow.  */
	sub	$16, %rdx		/* Account for the chunk just examined.  */
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx		/* rdx = remaining - 64 from here on.  */
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* Scan four aligned 16-byte chunks individually; at least 64 bytes
	   remain here (rdx = remaining - 64 > 0).  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi		/* Advance before the branch; matches0
					   compensates with a -16 offset.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)		/* Already 64-byte aligned: enter loop.  */

	sub	$64, %rdx
	jbe	L(exit_loop)

	/* Not yet 64-byte aligned: scan one more 64-byte stretch in
	   16-byte steps, then force alignment below.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi		/* Round rdi down to 64 bytes.  */
	and	$63, %rcx		/* rcx = bytes stepped back...  */
	add	%rcx, %rdx		/* ... re-credited to the count.  */

	.p2align 4
L(align64_loop):
	/* Main loop: rdi is 64-byte aligned; rdx = remaining - 64.
	   The four pcmpeqb results are merged with pmaxub (any 0xff match
	   byte survives the max), so one pmovmskb tests all 64 bytes.  */
	sub	$64, %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3		/* xmm3 = matches in chunk 0 or 2.  */
	pmaxub	%xmm2, %xmm4		/* xmm4 = matches in chunk 1 or 3.  */
	pmaxub	%xmm3, %xmm4		/* xmm4 = matches anywhere in 64 bytes.  */
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi		/* Step back to the block with the match.  */

	/* Locate the 16-byte chunk containing the first match.  xmm0/xmm2
	   still hold the compare results for chunks 0 and 1; chunks 2 and 3
	   are recomputed.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	pcmpeqb	48(%rdi), %xmm1		/* xmm1 is no longer needed as the
					   broadcast pattern past this point.  */
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax		/* Match must be in chunk 3 here.  */
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	/* Tail: rdx = remaining - 64, with -64 < rdx <= 0.  Check up to
	   four more chunks, bounds-checking each against n.  */
	add	$32, %rdx
	jle	L(exit_loop_32)		/* At most 32 bytes remain.  */

	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)		/* Match in chunk 2: verify against n.  */
	sub	$16, %rdx
	jle	L(return_null)		/* Nothing of chunk 3 is within n.  */

	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)		/* Match in chunk 3: verify against n.  */
	xor	%rax, %rax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes remain: rdx = remaining - 32, with
	   -32 < rdx <= 0.  */
	add	$32, %rdx		/* rdx = remaining (0 < rdx <= 32).  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)		/* Match in chunk 0: verify against n.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* Nothing of chunk 1 is within n.  */

	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)		/* Match in chunk 1: verify against n.  */
	xor	%rax, %rax
	ret

	/* Match epilogues.  The plain L(matchesNN) variants are reached only
	   when the match is known to lie within n; the L(matchesNN_1)
	   variants additionally check the match offset against the
	   remaining count in rdx and return NULL if it is out of range.  */

	.p2align 4
L(matches0):
	bsf	%eax, %eax		/* Caller advanced rdi by 64; chunk base
					   is rdi - 16.  */
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)		/* Match offset >= remaining count.  */
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%rax, %rax		/* Not found.  */
	ret
END(memchr)
314 | |
/* Export the internal alias __memchr and a hidden definition so that
   calls from within the library resolve directly (avoiding PLT
   indirection) — standard glibc convention for builtin string
   functions.  */
strong_alias (memchr, __memchr)

libc_hidden_builtin_def(memchr)
318 | |