memcmpeq-evex.S source code [glibc/sysdeps/x86_64/multiarch/memcmpeq-evex.S]

/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#if IS_IN (libc)

/* __memcmpeq is implemented as follows (VEC = VEC_SIZE bytes):
   1. size <= VEC: one masked load and masked compare.  The mask is
      built from the length, so neither a page-cross check nor a
      zero-size check is needed.
   2. VEC < size <= 2 * VEC: compare the first VEC, then the last VEC
      (the two may overlap).
   3. 2 * VEC < size <= 4 * VEC: compare the first two VEC one at a
      time to catch early mismatches, then the last two VEC together.
   4. 4 * VEC < size <= 8 * VEC: compare the first four VEC one at a
      time, then the last four VEC together without any branches.
   5. size > 8 * VEC: compare the first four VEC one at a time, then
      loop over 4 * VEC at a time with the s1 pointer aligned to
      VEC_SIZE, and finish with one, two or four VEC taken from the
      end of the buffers.

   Only equality is reported: eax is zero if the buffers match and
   some non-zero value otherwise.  */

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

/* Instruction selection.  VPCMP is always used with predicate 4
   (not-equal), so a non-zero result mask means a mismatch.  */
# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

/* Vector registers.  ymm16 and above can only be encoded with EVEX.
   NOTE(review): presumably chosen so that no vzeroupper is required
   before returning -- confirm against the other EVEX string
   functions.  */
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22


	.section .text.evex, "ax", @progbits
/* int __memcmpeq (const void *s1, const void *s2, size_t n)

   In:       rdi = s1, rsi = s2, rdx = n.
   Out:      eax = 0 if the first n bytes are equal, non-zero
	     otherwise (the value is a mismatch mask, not an ordering).
   Clobbers: rcx, rdx, rsi, rdi, k1, k2, YMM1-YMM4, flags.
   Leaf function; the stack is not touched.  */
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2

	/* Use masked loads as VEC_SIZE could page cross where length
	   (edx) would not.  The compare is masked with k2 as well, so
	   bytes past the length never set a bit in k1.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	ret


	/* VEC < size <= 2 * VEC and the first VEC matched: compare the
	   last VEC of both buffers.  */
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
	/* Shared exit: eax already holds a non-zero mismatch mask when
	   reached by a jump.  */
L(return_neq0):
	ret



	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Less than or equal to 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  The last four VEC are addressed from the end of
	   the buffers.  */

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	/* rdi = end of s1.  */
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted, due
	   to unlamination.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1.  Result is stored in YMM2:
	   YMM2 = (YMM2 ^ mem) | YMM1.  */
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2

	/* YMM3 = difference of the second-to-last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	/* YMM4 = difference of the last VEC.  */
	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Compare YMM4 with 0.  If any 1s s1 and s2 don't match.  */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set rdx to end of s1 minus 4 * VEC_SIZE.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because first 4x vec were checked already.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* YMM1 = difference of VEC 0.  */
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	/* YMM2 = (YMM2 ^ VEC 1 of s1) | YMM1.  */
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

	/* YMM3 = difference of VEC 2.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	/* YMM4 = difference of VEC 3.  */
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4 and test it.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	/* Tail: rdx still points 4 * VEC_SIZE before the end of s1, so
	   (VEC_SIZE * 3)(%rdx) is the last VEC.  */
	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4.  Result is stored in YMM4.  */
	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	/* More than 2 * VEC remain: fold in the other two VEC of the
	   final 4 * VEC window as well.  */
	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	/* Or together YMM1, YMM2, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
	/* Both labels share one exit: YMM4 holds the accumulated
	   difference in every case.  */
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
	/* 2 * VEC < size <= 4 * VEC and the first two VEC matched:
	   check the last two VEC with a single test.  */
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	/* YMM2 = (YMM2 ^ last VEC of s1) | YMM1.  */
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 Bytes from next cache line.  */
END (MEMCMPEQ)
#endif
240

Browse the source code of glibc/sysdeps/x86_64/multiarch/memcmpeq-evex.S