/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one vector
      at a time to check for early mismatches. Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
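
/* A rough C-level sketch of the dispatch described above, for
   orientation only; the helper names are illustrative stand-ins for
   the labelled assembly paths below, not real functions:

     int __memcmpeq (const void *s1, const void *s2, size_t n)
     {
       if (n <= VEC_SIZE)
         return cmp_1x_vec_masked (s1, s2, n);
       if (n <= 2 * VEC_SIZE)
         return cmp_2x_vec (s1, s2, n);
       if (n <= 4 * VEC_SIZE)
         return cmp_4x_vec (s1, s2, n);
       if (n <= 8 * VEC_SIZE)
         return cmp_8x_vec (s1, s2, n);
       return cmp_loop_4x_vec (s1, s2, n);
     }  */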

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22


	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits. */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest case. */
	ja	L(more_1x_vec)

	/* Create a mask of the bytes that are guaranteed to be valid
	   because of the length (edx). Using masked movs allows us to
	   skip checks for page crosses/zero size. */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
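	/* For example, with edx == 5, bzhil leaves 0x1f in ecx, so %k2
	   enables only the low 5 byte lanes of the vector; with edx == 0
	   the mask is empty and the compare below reports no mismatch. */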

	/* Use a masked load as a full VEC_SIZE load could page cross
	   where a load of only length (edx) would not. */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
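	/* __memcmpeq only has to return zero for equal and nonzero for
	   not equal, so the raw mismatch mask is a valid return value. */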
	kmovd	%k1, %eax
	ret


L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret



	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC. */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch. */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what. */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Check if size is 4 * VEC or less. */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what. */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop. */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches. */

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted so
	   that the loads below use base + displacement addressing; an
	   indexed memory operand would un-laminate the micro-fused
	   load + op uops. */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s. */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1. Result is stored in YMM2. */
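	/* (Here imm8 0xde computes YMM1 | (YMM2 ^ mem) and stores the
	   result in YMM2.) */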
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4. */
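	/* (imm8 0xfe is the three-way OR of the three operands.) */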
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Compare YMM4 with 0. If it has any 1s, s1 and s2 don't
	   match. */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx. */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer. */
	subq	%rdi, %rsi
	/* Align s1 pointer. */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked. */
	subq	$-(VEC_SIZE * 4), %rdi
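	/* From here on (%rsi, %rdi) addresses the s2 bytes that
	   correspond to the s1 bytes at (%rdi), so the loop advances
	   only %rdi. Aligning %rdi down may re-compare a few bytes that
	   were already checked above, which is harmless for an equality
	   test. */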
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length. */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch. */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4. Result is stored in YMM4. */
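	/* (Here imm8 0xf6 computes YMM4 | (YMM3 ^ mem) and stores the
	   result in YMM4.) */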
	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from next cache line. */
END (MEMCMPEQ)
#endif
242 | |