strchr-sse2.S source code [glibc/sysdeps/x86_64/multiarch/strchr-sse2.S]

/* strchr optimized with SSE2.
   Copyright (C) 2009-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef STRCHR
#  define STRCHR __strchr_sse2
# endif

# include <sysdep.h>

/* STRCHR (s, c): find the first byte equal to (char) c in the
   NUL-terminated string s.  AT&T syntax, x86-64.

   In:	%rdi = s
	%esi = c (only the low byte, %sil, takes part in comparisons)
   Out:	%rax = address of the first byte of s equal to c.  If the
	terminating NUL is reached first:
	  - with AS_STRCHRNUL defined, the address of that NUL;
	  - otherwise 0, unless c is itself NUL, in which case the
	    terminator is the match and its address is returned.
   Registers written: %rax, %rcx, %rdx, %rdi, %r8, %r9, %xmm0-%xmm6
   and the flags.  The stack is not used.

   Outline:
     1. Broadcast c into all 16 bytes of %xmm1.
     2. If the offset of s inside its 4096-byte page is at most 4032
	(= 4096 - 64), the 64 bytes starting at s are examined with
	unaligned loads.  Otherwise (L(cross_page)) the 64-byte aligned
	block containing s is examined instead and the mask bits that
	belong to bytes in front of s are shifted out.
     3. If nothing was found, L(loop64) scans one 64-byte aligned
	block per iteration.
     4. Every path that finds something builds a bit mask in %rax
	(bit i set: byte i is c or NUL, relative to %rdi) and locates
	the first hit with bsf.  */

	.text
ENTRY (STRCHR)
	/* Broadcast the low byte of c to every byte of %xmm1.  The page
	   offset test on s is interleaved with the shuffles; none of the
	   SSE instructions below touch the flags set by cmpl.  */
	movd	%esi, %xmm1
	movl	%edi, %eax
	andl	$4095, %eax		/* %eax = s mod 4096.  */
	punpcklbw %xmm1, %xmm1
	cmpl	$4032, %eax		/* 4032 = 4096 - 64.  */
	punpcklwd %xmm1, %xmm1
	pshufd	$0, %xmm1, %xmm1
	jg	L(cross_page)		/* s + 63 is past the 4096 boundary.  */

	/* Bytes s[0..15].  */
	movdqu	(%rdi), %xmm0
	pxor	%xmm3, %xmm3		/* %xmm3 = 0, comparand for NUL.  */
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0		/* 0xff where byte == c.  */
	pcmpeqb	%xmm3, %xmm4		/* 0xff where byte == 0.  */
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax		/* One mask bit per byte.  */
	test	%eax, %eax
	je	L(next_48_bytes)
	bsf	%eax, %eax		/* Index of the first c-or-NUL byte.  */
# ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
# else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax		/* Byte found is not c: return 0.  */
# endif
	ret

	.p2align 3
L(next_48_bytes):
	/* Bytes s[16..63], still with unaligned loads.  The three 16-bit
	   masks are assembled into bits 16..63 of %rax; bits 0..15 stay
	   clear because s[0..15] is already known to contain no hit.  */
	movdqu	16(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %ecx
	movdqu	32(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	salq	$16, %rcx
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm0, %xmm3		/* Last use of the zero in %xmm3: it
					   now holds the NUL mask itself.  */
	salq	$32, %rax
	pcmpeqb	%xmm1, %xmm0
	orq	%rcx, %rax
	por	%xmm3, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
	testq	%rax, %rax
	jne	L(return)
L(loop_start):
	/* We use this alignment to force the loop to be aligned to 8 but
	   not 16 bytes.  This gives better scheduling on AMD processors.  */
	.p2align 4
	pxor	%xmm6, %xmm6		/* %xmm6 = 0 for the whole loop.  */
	andq	$-64, %rdi		/* Round %rdi down to a multiple of 64;
					   the loop adds 64 before loading.  */
	.p2align 3
L(loop64):
	/* For a 16-byte chunk v, pminub (v ^ c, v) has a zero byte exactly
	   where v holds c (the xor is 0) or NUL (v is 0).  The byte-wise
	   minimum over the four chunks therefore contains a zero byte iff
	   the 64-byte block contains a hit.  */
	addq	$64, %rdi
	movdqa	(%rdi), %xmm5
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	pxor	%xmm1, %xmm5
	movdqa	48(%rdi), %xmm4
	pxor	%xmm1, %xmm2
	pxor	%xmm1, %xmm3
	pminub	(%rdi), %xmm5
	pxor	%xmm1, %xmm4
	pminub	16(%rdi), %xmm2
	pminub	32(%rdi), %xmm3
	pminub	%xmm2, %xmm5
	pminub	48(%rdi), %xmm4
	pminub	%xmm3, %xmm5
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm6, %xmm5
	pmovmskb %xmm5, %eax

	testl	%eax, %eax
	je	L(loop64)

	/* A hit lies somewhere in this block.  %xmm2-%xmm4 still hold the
	   per-chunk minima of chunks 1-3, so comparing them with zero gives
	   their masks.  %xmm5 was overwritten by the combined result, so
	   the mask for chunk 0 is recomputed from memory.  */
	movdqa	(%rdi), %xmm5
	movdqa	%xmm5, %xmm0
	pcmpeqb	%xmm1, %xmm5
	pcmpeqb	%xmm6, %xmm0
	por	%xmm0, %xmm5
	pcmpeqb	%xmm6, %xmm2
	pcmpeqb	%xmm6, %xmm3
	pcmpeqb	%xmm6, %xmm4

	/* %rax = mask0 | mask1 << 16 | mask2 << 32 | mask3 << 48.  */
	pmovmskb %xmm5, %ecx
	pmovmskb %xmm2, %eax
	salq	$16, %rax
	pmovmskb %xmm3, %r8d
	pmovmskb %xmm4, %edx
	salq	$32, %r8
	orq	%r8, %rax
	orq	%rcx, %rax
	salq	$48, %rdx
	orq	%rdx, %rax
	.p2align 3
L(return):
	/* In: %rax = non-zero 64-bit hit mask, bit i describing the byte
	   at %rdi + i.  Reached by fall-through from the loop exit and by
	   jumps from L(next_48_bytes) and L(cross_page).  */
	bsfq	%rax, %rax
# ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
# else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax		/* Byte found is not c: return 0.  */
# endif
	ret
	.p2align 4

L(cross_page):
	/* s is within the last 63 bytes of its 4096-byte window.  Examine
	   the 64-byte aligned block that contains s using aligned loads,
	   then drop the mask bits of the bytes in front of s.  */
	movq	%rdi, %rdx
	pxor	%xmm2, %xmm2		/* %xmm2 = 0, comparand for NUL.  */
	andq	$-64, %rdx		/* %rdx = s rounded down to 64.  */
	movdqa	%xmm1, %xmm0		/* Copy of the c vector, consumed by
					   the compare on the last chunk.  */
	movdqa	(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r8d
	movdqa	16(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %eax
	movdqa	32(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	salq	$16, %rax
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r9d
	movdqa	48(%rdx), %xmm3
	pcmpeqb	%xmm3, %xmm2
	salq	$32, %r9
	pcmpeqb	%xmm3, %xmm0
	orq	%r9, %rax
	orq	%r8, %rax
	por	%xmm2, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
	movl	%edi, %ecx
	subb	%dl, %cl		/* %cl = s - %rdx, i.e. s mod 64.  */
	shrq	%cl, %rax		/* Bit i now describes s[i].  */
	testq	%rax, %rax
	jne	L(return)
	jmp	L(loop_start)

END (STRCHR)
#endif
193

Browse the source code of glibc/sysdeps/x86_64/multiarch/strchr-sse2.S