/* memcmp with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif
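/* Strategy: lengths of at most CHAR_PER_VEC chars are handled with
   overlapping scalar or single-xmm loads, lengths up to 8x VEC with
   unrolled 2x-VEC checks plus an end-anchored L(last_2x_vec) tail,
   and anything larger with the aligned 4x-VEC loop in L(loop_4x).  */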

	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
	movl $0xffff, %ecx
# endif
	cmpq $CHAR_PER_VEC, %rdx
	ja L(more_1x_vec)

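	/* Fall through: n is in [0, CHAR_PER_VEC].  Handled below with
	   small-size branches and overlapping loads rather than a
	   char-by-char loop.  */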
# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl %edx
	jle L(cmp_0_1)

	movq (%rsi), %xmm0
	movq (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)

	movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_end_0_adj)
# else
	cmpl $8, %edx
	ja L(cmp_9_16)

	cmpl $4, %edx
	jb L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
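	/* For memcmpeq only equality matters, so subtract the first and
	   last (possibly overlapping) 4 bytes and OR the differences; any
	   nonzero bit means a mismatch.  */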
	movl (%rsi), %eax
	subl (%rdi), %eax

	movl -4(%rsi, %rdx), %esi
	subl -4(%rdi, %rdx), %esi

	orl %esi, %eax
	ret
#  else
	/* Combine the low and high 4-byte comparisons.  */
	movl -4(%rsi, %rdx), %ecx
	movl -4(%rdi, %rdx), %eax
	shlq $32, %rcx
	shlq $32, %rax
	movl (%rsi), %esi
	movl (%rdi), %edi
	orq %rsi, %rcx
	orq %rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq (%rsi), %rax
	subq (%rdi), %rax

	movq -8(%rsi, %rdx), %rcx
	subq -8(%rdi, %rdx), %rcx
	orq %rcx, %rax
	/* Convert the 64-bit result to a 32-bit boolean (we should have
	   made the ABI return long).  */
	setnz %cl
	movzbl %cl, %eax
#  else
	movq (%rsi), %rcx
	movq (%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)

	movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq -8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
#  endif
# endif
	ret

	.p2align 4,, 8
L(cmp_0_1):
	/* Flags set by the earlier comparison against 1.  */
	jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl (%rdi), %ecx
	xorl %edx, %edx
	cmpl (%rsi), %ecx
	je L(cmp_0_0)
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	subl %ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl %eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get a proper return value without a branch.  */
	bswapq %rcx
	bswapq %rax
	subq %rcx, %rax
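	/* CF is now set iff the s1 qword is lexicographically below the
	   s2 qword (the bswaps gave big-endian order), so sbbl
	   materializes -1 in that case and 0 otherwise; or-ing in 1 maps
	   0 to 1 and leaves -1 alone.  The values are known to differ
	   here.  */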
	sbbl %eax, %eax
	orl $1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving a few
	   bytes here doesn't change the number of fetch blocks.  */
	cmpl $1, %edx
	jbe L(cmp_0_1)
#  else
	/* We need the code-size savings here to prevent taking an extra
	   fetch block.  */
	decl %edx
	jle L(cmp_0_1)
#  endif
	movzwl (%rsi), %ecx
	movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl %ecx, %eax

	movzbl -1(%rsi, %rdx), %esi
	movzbl -1(%rdi, %rdx), %edi
	subl %edi, %esi
	orl %esi, %eax
#  else
	bswapl %ecx
	bswapl %eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	shrl %ecx
	shrl %eax

	/* Eat a partial register stall here.  Saves code by stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU op.  */
	movb (%rsi, %rdx), %cl
	movzbl (%rdi, %rdx), %edi
	orl %edi, %eax
	subl %ecx, %eax
#  endif
	ret
# endif

	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
	movl $0xffff, %ecx
# endif
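	/* n > CHAR_PER_VEC here, so the first VEC can be checked
	   unconditionally.  */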
	movups (%rsi), %xmm0
	movups (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq $(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq $(CHAR_PER_VEC * 2), %rdx
# endif
	ja L(more_2x_vec)

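	/* n is in (CHAR_PER_VEC, CHAR_PER_VEC * 2]: finish with one
	   end-anchored VEC load that may overlap the VEC just checked.  */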
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as the machines this code runs on are
	   liable to have a partial register stall.  */
	jnz L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl $3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	addl %edx, %eax
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
	ret
#  endif
# else
# endif

	.p2align 5
L(more_2x_vec):
	movups (VEC_SIZE * 1)(%rsi), %xmm0
	movups (VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_1)

	cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe L(last_2x_vec)

	cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja L(more_8x_vec)

	/* Do the comparisons for [65, 96] and [97, 128] 2x VEC at a
	   time.  This can harm performance if there is a non-zero return
	   in [65, 80] or [97, 112] but helps performance otherwise.
	   Generally a zero return is hotter.  */
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz L(ret_nonzero_vec_start_2_3)

	cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)

	movups (VEC_SIZE * 4)(%rsi), %xmm0
	movups (VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 5)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz L(last_2x_vec)
	ret
# else
	jnz L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
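	/* Compare the final 2x VEC with end-anchored loads.  These may
	   overlap chars that were already checked, which is harmless.  */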
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax
	/* Partial register stall.  */

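	/* eax now encodes the mismatches so that its lowest set bit is
	   the position of the first difference relative to the
	   (VEC_SIZE * -2) end-anchored loads; bsfl extracts that byte
	   offset.  */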
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	addl %edx, %eax
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
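	/* Combine the VEC 4 mask in xmm1 with the AND-combined VEC 4/5
	   mask already in eax so that the lowest set bit of eax marks
	   the first mismatch; the bsfl below then gives its byte offset
	   relative to (VEC_SIZE * 4)(%rdi).  */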
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(more_8x_vec):
	subq %rdi, %rsi
	leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq $(VEC_SIZE * -1), %rdi
	addq %rdi, %rsi
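	/* rdi is now aligned down to VEC_SIZE and rsi keeps its original
	   displacement from rdi.  rdx points 6x VEC before the end of
	   s1, so each loop iteration can check 4x VEC (at offsets 2..5
	   VEC) and the loop exits with at most 4x VEC left for the
	   tail.  */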
	.p2align 4
L(loop_4x):
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

	movups (VEC_SIZE * 4)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_loop)

	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi
	cmpq %rdi, %rdx
	ja L(loop_4x)
	/* Get remaining length in edx.  */
	subl %edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl $2, %edx
# endif
	cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)

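	/* More than 2x VEC of unchecked chars remain: compare the next
	   2x VEC here, then fall into L(last_2x_vec) for the (possibly
	   overlapping) final 2x VEC.  */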
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax

	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
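	/* Combine the four per-VEC compare masks into a 64-bit value
	   whose lowest set bit marks the first mismatch, so bsfq gives
	   the byte offset of the first difference relative to
	   (VEC_SIZE * 2)(%rdi).  */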
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall $(VEC_SIZE * 1), %edx
	leal 1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax

	salq $32, %rax
	orq %rdx, %rax

	bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif