/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
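
/* For concreteness, with VEC_SIZE = 32 the thresholds above work out as
   follows (derived from the constants defined below, not stated
   elsewhere in this file):
     - memcmp (CHAR_PER_VEC = 32 bytes): sizes below 32 take the
       L(less_vec) path, 32..64 use at most 2 vector compares, 65..128
       use at most 4, 129..256 use the unrolled 8-compare path, and
       larger sizes enter the aligned 4 * VEC_SIZE loop.
     - wmemcmp (CHAR_PER_VEC = 8 wide chars): the same thresholds apply
       counted in 4-byte wide characters, i.e. 8..16, 17..32, 33..64
       and above.  */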

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE	4
#  define VPCMP		vpcmpd
# else
#  define CHAR_SIZE	1
#  define VPCMP		vpcmpub
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
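
/* For example, comparing the single element 0xffffffff against
   0x00000001: wmemcmp must treat the elements as wchar_t (a signed
   32-bit int here), so the result is negative (-1 < 1), while memcmp
   compares the same four bytes as unsigned chars and returns a
   positive value (0xff > 0x01).  Hence VPCMP is vpcmpd for wmemcmp but
   vpcmpub for memcmp.  */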

	.section .text.evex,"ax",@progbits
ENTRY (MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Zero YMM0.  The 4x VEC reduction is done with vpxor + vptern,
	   so a compare against zero is needed to get the mismatch
	   mask.  */
	vpxorq	%XMM0, %XMM0, %XMM0

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted;
	   the complex address mode would cause unlamination of
	   micro-fused uops.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
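	/* NB: the vpternlogd immediate is an 8-bit truth table over the
	   bit triple (dest, second source, first source), where the
	   first source is the operand written right after the immediate
	   in AT&T syntax.  0xfe is 1 for every row except all-zeros,
	   i.e. the 3-way OR used above; 0xde below computes
	   (dest ^ first source) | second source, folding the s1/s2 xor
	   and the running OR into one instruction.  */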

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   ORing with YMM3.  The result is stored in YMM4.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
	/* Compare YMM4 with 0.  If there are any 1s, s1 and s2 don't
	   match.  */
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* NB: aligning to 32 here allows for the rest of the jump
	   targets to be tuned for 32-byte alignment.  Most importantly,
	   this ensures the L(more_8x_vec) loop is 32-byte aligned.  */
	.p2align 5
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0
	   but is also faster for size = CHAR_SIZE.  */
	cmpl	$1, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
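	/* NB: every bit of (edi | esi) is at least the corresponding
	   bit of each input, so the masked OR below is >= each
	   pointer's own page offset.  If the check passes, a VEC_SIZE
	   load from either pointer cannot cross a page; a false
	   positive merely falls back to the slower small-size paths.  */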
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	VMOVU	(%rsi), %YMM2
	VPCMP	$4, (%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	/* Zero out, in eax, any mismatch bits at or beyond index edx
	   (the size in CHARs); bzhil also sets ZF for the branch
	   below.  */
	bzhil	%edx, %eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl
	   zero-idiom above.  */
	setg	%dl
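	/* dl is 1 if the s1 element is greater, else 0, so the lea
	   below computes 2 * rdx - 1, i.e. exactly 1 or -1.  Equality
	   is impossible here since a mismatch bit was found.  */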
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* NB: No p2align necessary.  Alignment % 16 is naturally 1
	   which is good enough for a target not in a loop.  */
L(return_vec_1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* NB: No p2align necessary.  Alignment % 16 is naturally 2
	   which is good enough for a target not in a loop.  */
L(return_vec_2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPCMP	$4, %YMM1, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPCMP	$4, %YMM2, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPCMP	$4, %YMM3, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	/* Restore s1 pointer to rdi.  */
	movq	%rdx, %rdi
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret

	.p2align 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(return_vec_1_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP the size can only be 0, 4, 8, 12, 16, 20,
	   24 or 28 bytes.  */
	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(between_8_15)
	cmpl	$4, %edx
	jae	L(between_4_7)
L(between_2_3):
	/* Load as big endian to avoid branches.  */
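	/* The sequence below builds (s[0] << 16) | (s[1] << 8) | s[n-1]
	   in each of eax/ecx (for size 2 the last byte is s[1], loaded
	   twice), so a single subtraction produces a result with the
	   correct memcmp sign.  */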
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	ret
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	ret

	.p2align 4
L(between_8_15):
# endif
	/* If USE_AS_WMEMCMP, fall through into the 8-15 byte case.  */
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
	VPCMP	$4, %XMM1, %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
	VPCMP	$4, %XMM1, %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	VMOVU	(%rsi), %XMM2
	VPCMP	$4, (%rdi), %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
	VPCMP	$4, (%rdi), %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(zero)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid
	   branches.  */
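	/* Each of rax/rcx ends up holding ((first 4 bytes as big
	   endian) << 32) | (last 4 bytes as big endian).  The overlap
	   for sizes 4-7 is harmless: a byte appearing in both halves is
	   simply compared the same way twice, so an unsigned 64-bit
	   comparison of the two values orders exactly like memcmp.
	   subq sets ZF for the equal case, and sbbl/orl turn the borrow
	   into -1 or 1.  */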
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	jz	L(zero_4_7)
	sbbl	%eax, %eax
	orl	$1, %eax
L(zero_4_7):
	ret
# endif

END (MEMCMP)
#endif