/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
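
/* Note: this EVEX variant relies on AVX512VL and AVX512BW
   (vmovdqu8/vmovdqu64 on ymm16-ymm22, vpcmpub, vptestmb and the
   k-mask registers), which is what ISA level 4 (x86-64-v4)
   corresponds to here.  */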

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one vector
      at a time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
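
/* Illustrative C-level contract (a sketch, not part of the build):

     int __memcmpeq (const void *s1, const void *s2, size_t n);

   __memcmpeq must return zero iff the first n bytes of s1 and s2 are
   equal and may return any nonzero value otherwise, so a raw mismatch
   bitmask from kmovd can be returned directly instead of the signed
   byte difference memcmp would need.  */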

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb
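
/* VPCMP (vpcmpub) compares bytes against the predicate in its
   immediate and writes one bit per byte into a k-mask register.
   VPTEST (vptestmb) sets a mask bit for every byte whose AND result
   is nonzero, so testing a register against itself yields a mask of
   its nonzero bytes.  */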

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0	ymm16
# define YMM1	ymm17
# define YMM2	ymm18
# define YMM3	ymm19
# define YMM4	ymm20
# define YMM5	ymm21
# define YMM6	ymm22
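
/* ymm16-ymm31 can only be encoded with EVEX, so using ymm16-ymm22
   keeps the VEX-visible upper register state untouched and no
   VZEROUPPER is needed before returning.  */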


	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest case.  */
	ja	L(more_1x_vec)

	/* Create a mask of bytes that are guaranteed to be valid because
	   of the length (edx).  Using masked movs allows us to skip
	   checks for page crosses/zero size.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
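	/* %ecx now has exactly %edx low bits set: roughly
	   ecx = len < 32 ? (1u << len) - 1 : 0xffffffff
	   (bzhi keeps every bit of the source when the index is >= the
	   operand size, and len <= VEC_SIZE == 32 on this path).  */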

	/* Use masked loads as a full VEC_SIZE load could page cross
	   where the length (edx) would not.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4, (%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
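	/* Predicate 4 is "not equal", so %eax is zero iff every
	   in-bounds byte matches.  __memcmpeq only has to distinguish
	   zero from nonzero, so the mask can be returned as-is.  */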
	ret


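	/* Size is in (VEC_SIZE, 2 * VEC_SIZE]: the first vector was
	   already checked, so compare the last VEC_SIZE bytes ending at
	   s + size.  This load may overlap bytes that were already
	   compared, which is harmless for a pure equality check.  */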
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* 4 * VEC or less.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
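
	/* The four loads below cover the last 4 * VEC_SIZE bytes ending
	   at s + size, so they may overlap vectors already checked
	   above; redoing that work is cheaper than branching on the
	   exact remainder.  Differences are accumulated with xor/or and
	   tested once at the end.  */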

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until after the address adjustment, to
	   avoid micro-op unlamination from indexed addressing.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1.  Result is stored in YMM2.  */
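	/* For vpternlogd, bit (A*4 + B*2 + C) of the immediate gives the
	   result bit for A = destination, B = first source and C =
	   second source/memory operand: (A ^ C) | B is 11011110b = 0xde,
	   and plain A | B | C (used below) is 11111110b = 0xfe.  */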
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  YMM2 already
	   carries YMM1's differences, so YMM4 accumulates all four.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Compare YMM4 with 0.  If any 1s, s1 and s2 don't match.  */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
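	/* subq of -(VEC_SIZE * 4) = -128 fits in a sign-extended 8-bit
	   immediate, while addq of +128 would need a 4-byte immediate,
	   so the subtraction gives a smaller encoding.  */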
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
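	/* More than VEC_SIZE bytes remain past what the loop handled:
	   accumulate up to three more vectors from the end of the
	   buffers.  They may re-check bytes the loop already covered,
	   which is harmless.  */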
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4.  Result is stored in YMM4.  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
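	/* Size is in (2 * VEC_SIZE, 4 * VEC_SIZE]: the first two vectors
	   were already checked, so compare the last two vectors ending
	   at s + size (possibly overlapping what was already checked)
	   and test the accumulated difference once.  */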
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from next cache line.  */
END (MEMCMPEQ)
#endif