/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where a
      vector compare is not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. For size in [0, VEC_SIZE], use a masked ymm load/compare so the
      page-cross case needs no separate code path.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      vector at a time to check for early mismatches. Only do this if
      it is guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

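/* A rough C sketch of the size dispatch described above, for
   illustration only (it is not the code glibc compiles).  VEC stands
   for VEC_SIZE; neq_vec, neq_vec_masked, neq_4vec_tail and
   loop_4x_then_tail are hypothetical helpers, each standing in for
   the corresponding vector compare sequence below:

   int
   __memcmpeq_sketch (const unsigned char *s1, const unsigned char *s2,
		      size_t n)
   {
     enum { VEC = 32 };
     if (n <= VEC)			// masked compare, handles n == 0 too
       return neq_vec_masked (s1, s2, n);
     if (neq_vec (s1, s2))		// always check the first vector
       return 1;
     if (n <= 2 * VEC)			// one overlapping compare at the end
       return neq_vec (s1 + n - VEC, s2 + n - VEC);
     if (neq_vec (s1 + VEC, s2 + VEC))
       return 1;
     if (n <= 4 * VEC)			// two overlapping compares at the end
       return (neq_vec (s1 + n - 2 * VEC, s2 + n - 2 * VEC)
	       | neq_vec (s1 + n - VEC, s2 + n - VEC));
     if (neq_vec (s1 + 2 * VEC, s2 + 2 * VEC)
	 || neq_vec (s1 + 3 * VEC, s2 + 3 * VEC))
       return 1;
     if (n <= 8 * VEC)			// branchless 4-vector tail
       return neq_4vec_tail (s1 + n - 4 * VEC, s2 + n - 4 * VEC);
     return loop_4x_then_tail (s1, s2, n);	// aligned 4 * VEC loop
   }  */
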
# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

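/* Note (an added observation, not from the original sources): the
   masked load below uses vmovdqu8 because the k-register mask is per
   byte, while the unmasked loads can use vmovdqu64.  The ymm16-ymm22
   registers require EVEX encodings; using them instead of ymm0-ymm15
   is what lets this function return without a vzeroupper.  */
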
	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest case.  */
	ja	L(more_1x_vec)

	/* Create a mask of bytes that are guaranteed to be valid because
	   of length (edx). Using masked movs allows us to skip checks for
	   page crosses/zero size.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
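	/* Added worked example: with length edx == 5, bzhil keeps only
	   the low 5 bits of -1, so ecx == 0x1f and k2 enables exactly the
	   first 5 byte lanes of the masked load and compare below.  With
	   edx == 0 the mask is 0 and the compare below returns 0.  */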

	/* Use masked loads, as a full VEC_SIZE load could cross a page
	   boundary where a load of only length (edx) bytes would not.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4, (%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	ret
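
	/* Added note: predicate 4 in VPCMP is "not equal", so k1 gets one
	   bit per in-bounds byte that differs and eax is the
	   nonzero-on-mismatch value __memcmpeq must return.  A hedged C
	   intrinsics sketch of this [0, VEC_SIZE] path (illustration only,
	   not what glibc compiles; requires AVX512VL + AVX512BW + BMI2):

	   #include <immintrin.h>

	   int
	   memcmpeq_le_vec_sketch (const void *s1, const void *s2,
				   unsigned int len)
	   {
	     __mmask32 k = (__mmask32) _bzhi_u32 (-1U, len);
	     __m256i v2 = _mm256_maskz_loadu_epi8 (k, s2);
	     __m256i v1 = _mm256_maskz_loadu_epi8 (k, s1);
	     return (int) _mm256_mask_cmpneq_epu8_mask (k, v1, v2);
	   }  */
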


	/* Size in (VEC_SIZE, 2 * VEC_SIZE]: the first vector was already
	   checked at L(more_1x_vec), so one compare of the last VEC_SIZE
	   bytes (overlapping the first vector) finishes the job.  */
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals (predicate 4) to directly check for
	   mismatch.  */
	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* 4 * VEC or less.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to the 4x VEC loop for sizes above 8 * VEC.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted; this
	   avoids unlamination from combining an indexed addressing mode
	   with the ALU ops below.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1. Result is stored in YMM2.  */
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
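	/* Added explanation: the vpternlogd immediate is a 3-input truth
	   table evaluated per bit, with the destination's old value as
	   input A, the second source as input B, and the third source
	   (here the memory operand) as input C; each result bit equals
	   bit ((A << 2) | (B << 1) | C) of the immediate.  0xde encodes
	   B | (A ^ C), so the instruction above computes
	   YMM1 | (YMM2 ^ mem) in one op.  The 0xf6 (A | (B ^ C)) and
	   0xfe (A | B | C) forms used later follow the same scheme.  */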

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	/* YMM2 already accumulates YMM1, so only YMM2, YMM3, and YMM4
	   still need to be combined.  */
	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Compare YMM4 with 0. If there are any 1s, s1 and s2 don't
	   match.  */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret
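
	/* Added sketch of what the block above computes, with VEC-sized
	   chunks written as calls to a hypothetical xor_vec helper:

	   acc = xor_vec (s1 + n - 4 * VEC, s2 + n - 4 * VEC)
		 | xor_vec (s1 + n - 3 * VEC, s2 + n - 3 * VEC)
		 | xor_vec (s1 + n - 2 * VEC, s2 + n - 2 * VEC)
		 | xor_vec (s1 + n - 1 * VEC, s2 + n - 1 * VEC);
	   return acc != 0;

	   Because the four loads are anchored at the end of the buffers,
	   they overlap the vectors already checked when n < 8 * VEC,
	   which is what makes this tail branchless.  */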

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of the 4x loop range (s1 + size - 4 * VEC).  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x vec were checked already. Subtract
	   the negated constant so the immediate fits in a sign-extended
	   imm8.  */
	subq	$-(VEC_SIZE * 4), %rdi
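	/* Added outline of the loop below in C-like form (off and the
	   neq_4vec helper are hypothetical names; s1 here is the aligned
	   pointer kept in rdi):

	   ptrdiff_t off = s2 - s1;	// kept in rsi
	   do
	     {
	       if (neq_4vec (s1, s1 + off))
		 return nonzero_mask;
	       s1 += 4 * VEC_SIZE;
	     }
	   while (s1 < s1_end - 4 * VEC_SIZE);

	   The remaining (0, 4 * VEC_SIZE] bytes are then handled after
	   the loop with loads anchored at the end, as in the branchless
	   tail above.  */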
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length. For example, if
	   VEC_SIZE bytes or less remain, rdi >= 3 * VEC_SIZE and only the
	   last vector (already in YMM4) needs to be checked.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4. Result is stored in YMM4.  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from the next cache line.  */
END (MEMCMPEQ)
#endif