/* memcmp with SSE2
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

        .text
ENTRY (memcmp)
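        /* SysV x86-64 ABI: %rdi = s1, %rsi = s2, %RDX_LP = n
           (32 bits wide on the x32/ILP32 ABI).  */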
#ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
#endif
        test %RDX_LP, %RDX_LP
        jz L(finz)
        cmpq $1, %rdx
        jbe L(finr1b)
        subq %rdi, %rsi
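        /* From here on %rsi holds the difference s2 - s1, so one
           advancing pointer serves both buffers: (%rdi) reads s1
           and (%rdi, %rsi) reads the matching byte of s2.  */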
        movq %rdx, %r10
        cmpq $32, %r10
        jae L(gt32)
        /* Handle small chunks and the last block of less than 32
           bytes.  */
L(small):
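        /* %r10 < 32 here.  Its low bits select which of the 1-, 2-,
           4- and 8-byte compares below run; if 16 bytes then remain,
           L(s16b) finishes with one SSE2 compare.  */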
        testq $1, %r10
        jz L(s2b)
        movzbl (%rdi), %eax
        movzbl (%rdi, %rsi), %edx
        subq $1, %r10
        je L(finz1)
        addq $1, %rdi
        subl %edx, %eax
        jnz L(exit)
L(s2b):
        testq $2, %r10
        jz L(s4b)
        movzwl (%rdi), %eax
        movzwl (%rdi, %rsi), %edx
        subq $2, %r10
#ifdef USE_AS_MEMCMPEQ
        je L(finz1)
#else
        je L(fin2_7)
#endif
        addq $2, %rdi
        cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
        jnz L(neq_early)
#else
        jnz L(fin2_7)
#endif
L(s4b):
        testq $4, %r10
        jz L(s8b)
        movl (%rdi), %eax
        movl (%rdi, %rsi), %edx
        subq $4, %r10
#ifdef USE_AS_MEMCMPEQ
        je L(finz1)
#else
        je L(fin2_7)
#endif
        addq $4, %rdi
        cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
        jnz L(neq_early)
#else
        jnz L(fin2_7)
#endif
L(s8b):
        testq $8, %r10
        jz L(s16b)
        movq (%rdi), %rax
        movq (%rdi, %rsi), %rdx
        subq $8, %r10
#ifdef USE_AS_MEMCMPEQ
        je L(sub_return8)
#else
        je L(fin2_7)
#endif
        addq $8, %rdi
        cmpq %rdx, %rax
#ifdef USE_AS_MEMCMPEQ
        jnz L(neq_early)
#else
        jnz L(fin2_7)
#endif
L(s16b):
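        /* Exactly 16 bytes remain.  pcmpeqb sets a byte in %xmm1 to
           0xff wherever the buffers match; pmovmskb packs those into
           a 16-bit mask, which equals 0xffff iff all 16 bytes
           matched, so subtracting 0xffff yields zero exactly on
           equality.  */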
        movdqu (%rdi), %xmm1
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
        pmovmskb %xmm1, %eax
        subl $0xffff, %eax
        ret
#else
        pmovmskb %xmm1, %edx
        xorl %eax, %eax
        subl $0xffff, %edx
        jz L(finz)
        bsfl %edx, %ecx
        leaq (%rdi, %rcx), %rcx
        movzbl (%rcx), %eax
        movzbl (%rsi, %rcx), %edx
        jmp L(finz1)
#endif
        .p2align 4,, 4
L(finr1b):
        movzbl (%rdi), %eax
        movzbl (%rsi), %edx
L(finz1):
        subl %edx, %eax
L(exit):
        ret
#ifdef USE_AS_MEMCMPEQ
        .p2align 4,, 4
L(sub_return8):
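        /* memcmpeq only needs equal/not-equal: fold the 64-bit
           difference into a nonzero 32-bit result by OR-ing its
           high and low halves.  */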
        subq %rdx, %rax
        movl %eax, %edx
        shrq $32, %rax
        orl %edx, %eax
        ret
#else
        .p2align 4,, 4
L(fin2_7):
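        /* %rax/%rdx hold the last loaded 2-, 4- or 8-byte chunks;
           they may still be equal, which the cmpq below checks.
           Otherwise bsfq finds the first differing bit, clearing its
           low three bits (shift right then left by 3) gives the bit
           offset of the first differing byte, and both chunks are
           shifted down so that byte can be compared unsigned.  */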
        cmpq %rdx, %rax
        jz L(finz)
        movq %rax, %r11
        subq %rdx, %r11
        bsfq %r11, %rcx
        sarq $3, %rcx
        salq $3, %rcx
        sarq %cl, %rax
        movzbl %al, %eax
        sarq %cl, %rdx
        movzbl %dl, %edx
        subl %edx, %eax
        ret
#endif
        .p2align 4,, 4
L(finz):
        xorl %eax, %eax
        ret
#ifdef USE_AS_MEMCMPEQ
        .p2align 4,, 4
L(neq_early):
        movl $1, %eax
        ret
#endif
        /* For blocks bigger than 32 bytes:
           1. Advance one of the address pointers to be 16-byte
              aligned.
           2. Treat the case of both pointers being 16-byte aligned
              separately to avoid movdqu.
           3. Handle blocks of more than 64 consecutive bytes with
              unrolling to reduce branches.
           4. At least one pointer is 16-byte aligned, so the memory
              operand form of pcmpeqb can be used.  */
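
        /* A rough sketch of the flow below (illustrative only; the
           labels are the ones used in this file):

             head:   if %rdi is misaligned, do one unaligned 16-byte
                     compare, then round %rdi up to a 16-byte boundary;
             L(ATR): taken when both inputs end up 16-byte aligned,
                     so aligned loads (movdqa) can be used throughout;
             main:   compare 4 x 16 bytes per iteration while at least
                     64 bytes remain, then 16-byte steps while at
                     least 32 remain;
             tail:   the final < 32 bytes are handled by L(small).  */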
        .p2align 4,, 4
L(gt32):
        movq %rdx, %r11
        addq %rdi, %r11
        movq %rdi, %r8

        andq $15, %r8
        jz L(16am)
        /* Both pointers may be misaligned.  */
        movdqu (%rdi), %xmm1
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        subl $0xffff, %edx
        jnz L(neq)
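        /* Round %rdi up to the next 16-byte boundary.  The bytes
           skipped this way were already covered by the unaligned
           16-byte compare above; %rsi still holds the delta, so
           (%rdi, %rsi) stays in sync.  */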
        neg %r8
        leaq 16(%rdi, %r8), %rdi
L(16am):
        /* Handle the case of two 16-byte aligned pointers
           separately.  */
        testq $15, %rsi
        jz L(ATR)
        testq $16, %rdi
        jz L(A32)
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
L(A32):
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)
        /* Pre-unroll to be ready for the unrolled 64-byte loop.  */
        testq $32, %rdi
        jz L(A64)
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

L(A64):
        movq %r11, %r10
        andq $-64, %r10
        cmpq %r10, %rdi
        jae L(mt32)

L(A64main):
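        /* Main loop: four 16-byte compares per iteration (64 bytes)
           to cut loop-branch overhead.  */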
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %rdi, %r10
        jne L(A64main)

L(mt32):
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)

L(A32main):
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %rdi, %r10
        jne L(A32main)
L(mt16):
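        /* Fewer than 32 bytes remain; compute the leftover count in
           %r10 and finish with the small-block code above.  */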
        subq %rdi, %r11
        je L(finz)
        movq %r11, %r10
        jmp L(small)

        .p2align 4,, 4
L(neq):
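        /* A 16-byte block differed.  %edx holds mask - 0xffff where
           the mask has a 1-bit per equal byte; since bsf(x) equals
           bsf(-x), bsfl below recovers the index of the first
           mismatching byte for the memcmp case.  */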
#ifdef USE_AS_MEMCMPEQ
        movl $1, %eax
        ret
#else
        bsfl %edx, %ecx
        movzbl (%rdi, %rcx), %eax
        addq %rdi, %rsi
        movzbl (%rsi,%rcx), %edx
        jmp L(finz1)
#endif

        .p2align 4,, 4
L(ATR):
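        /* Both-aligned path: %rdi is 16-byte aligned and the delta
           in %rsi is a multiple of 16, so aligned loads (movdqa and
           the pcmpeqb memory operand) are safe on both buffers.  */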
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)
        testq $16, %rdi
        jz L(ATR32)

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
        cmpq %rdi, %r10
        je L(mt16)

L(ATR32):
        movq %r11, %r10
        andq $-64, %r10
        testq $32, %rdi
        jz L(ATR64)

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

L(ATR64):
        cmpq %rdi, %r10
        je L(mt32)

L(ATR64main):
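        /* Same 4 x 16-byte unrolling as L(A64main), but with
           aligned loads on both operands.  */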
        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
        cmpq %rdi, %r10
        jne L(ATR64main)

        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)

L(ATR32res):
        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %r10, %rdi
        jne L(ATR32res)

        subq %rdi, %r11
        je L(finz)
        movq %r11, %r10
        jmp L(small)
        /* Align to 16 bytes to improve instruction fetch.  */
        .p2align 4,, 4
END(memcmp)

#ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp)
#else
# undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
#endif