/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one vector
      at a time to check for early mismatches. Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
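
/* A rough C-level sketch of the dispatch described above, for
   orientation only; the helper names are illustrative stand-ins for
   the labelled assembly paths below, not real functions:

     int __memcmpeq (const void *s1, const void *s2, size_t n)
     {
       if (n <= VEC_SIZE)
         return cmp_1x_vec_masked (s1, s2, n);
       if (n <= 2 * VEC_SIZE)
         return cmp_2x_vec (s1, s2, n);
       if (n <= 4 * VEC_SIZE)
         return cmp_4x_vec (s1, s2, n);
       if (n <= 8 * VEC_SIZE)
         return cmp_8x_vec (s1, s2, n);
       return cmp_loop_4x_vec (s1, s2, n);
     }  */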

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22


	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits. */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest case. */
	ja	L(more_1x_vec)

	/* Create a mask of the bytes that are guaranteed to be valid
	   because of the length (edx). Using masked movs allows us to
	   skip checks for page crosses/zero size. */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
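	/* For example, with edx == 5, bzhil leaves 0x1f in ecx, so %k2
	   enables only the low 5 byte lanes of the vector; with edx == 0
	   the mask is empty and the compare below reports no mismatch. */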

	/* Use a masked load as a full VEC_SIZE load could page cross
	   where a load of only length (edx) would not. */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
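	/* __memcmpeq only has to return zero for equal and nonzero for
	   not equal, so the raw mismatch mask is a valid return value. */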
	kmovd	%k1, %eax
	ret


L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret



	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC. */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch. */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what. */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Check if size is 4 * VEC or less. */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what. */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop. */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches. */

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted so
	   that the loads below use base + displacement addressing; an
	   indexed memory operand would un-laminate the micro-fused
	   load + op uops. */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s. */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1. Result is stored in YMM2. */
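	/* (Here imm8 0xde computes YMM1 | (YMM2 ^ mem) and stores the
	   result in YMM2.) */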
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4. */
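	/* (imm8 0xfe is the three-way OR of the three operands.) */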
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Compare YMM4 with 0. If it has any 1s, s1 and s2 don't
	   match. */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx. */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer. */
	subq	%rdi, %rsi
	/* Align s1 pointer. */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked. */
	subq	$-(VEC_SIZE * 4), %rdi
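	/* From here on (%rsi, %rdi) addresses the s2 bytes that
	   correspond to the s1 bytes at (%rdi), so the loop advances
	   only %rdi. Aligning %rdi down may re-compare a few bytes that
	   were already checked above, which is harmless for an equality
	   test. */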
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length. */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch. */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4. Result is stored in YMM4. */
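	/* (Here imm8 0xf6 computes YMM4 | (YMM3 ^ mem) and stores the
	   result in YMM4.) */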
	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from next cache line. */
END (MEMCMPEQ)
#endif
242 | |