/* memcmp with SSE2
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (memcmp)
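	/* Inputs: %rdi = s1, %rsi = s2, %rdx = length in bytes.  */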
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	test	%RDX_LP, %RDX_LP
	jz	L(finz)
	cmpq	$1, %rdx
	jbe	L(finr1b)
	subq	%rdi, %rsi
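	/* From here on %rsi holds the offset s2 - s1, so the second
	   buffer is addressed as (%rdi, %rsi) while %rdi walks s1.  */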
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)
	/* Handle small chunks and last block of less than 32 bytes.  */
L(small):
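	/* %r10 holds the remaining length (1..31).  Compare 1, 2, 4 and
	   8 bytes according to the low bits of %r10, then finish any
	   16-byte remainder at L(s16b).  */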
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$2, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$4, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(sub_return8)
#else
	je	L(fin2_7)
#endif
	addq	$8, %rdi
	cmpq	%rdx, %rax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s16b):
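	/* Exactly 16 bytes are left here; compare them with a single
	   unaligned 16-byte load from each buffer.  */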
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
	pmovmskb %xmm1, %eax
	subl	$0xffff, %eax
	ret
#else
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)
#endif
	.p2align 4,, 4
L(finr1b):
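	/* Length is exactly 1; %rsi still holds the second pointer (it
	   has not been turned into an offset yet).  */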
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(sub_return8):
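	/* memcmpeq only needs a zero/non-zero result: fold the 64-bit
	   difference of the two words into 32 bits so the return value
	   is non-zero exactly when the words differ.  */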
	subq	%rdx, %rax
	movl	%eax, %edx
	shrq	$32, %rax
	orl	%edx, %eax
	ret
#else
	.p2align 4,, 4
L(fin2_7):
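	/* %rax/%rdx hold the 2-8 byte chunks just loaded (little endian).
	   The lowest set bit of %rax - %rdx marks the first differing
	   byte; shift that byte down in both words and return the byte
	   difference.  */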
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret
#endif
	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(neq_early):
	movl	$1, %eax
	ret
#endif
	/* For blocks bigger than 32 bytes:
	   1. Advance one of the pointers to be 16B aligned.
	   2. Treat the case of both pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one pointer is 16B aligned, so the memory-operand
	      form of pcmpeqb can be used.  */
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11
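	/* %r11 = s1 + length: one past the end of the first buffer.  */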
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
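	/* The first 16 bytes matched; round %rdi up to the next 16-byte
	   boundary (the bytes skipped over were covered by the compare
	   above).  */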
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  */
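	/* %rsi is the offset s2 - s1 and %rdi is now 16-byte aligned, so
	   a 16-byte-aligned offset means both buffers are aligned and
	   the movdqa path at L(ATR) can be used.  */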
	testq	$15, %rsi
	jz	L(ATR)
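	/* The s2 side is not 16-byte aligned, so its loads use movdqu.
	   If %rdi is 16- but not 32-byte aligned, compare one 16-byte
	   block first so the unrolled loops start at a 32-byte
	   boundary.  */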
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
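	/* %r10 = highest 32-byte boundary not past the end; once %rdi
	   reaches it, fewer than 32 bytes remain and the tail is handled
	   at L(mt16).  */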
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	/* Pre-unroll to be ready for the unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
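	/* %r10 = highest 64-byte boundary not past the end; the loop at
	   L(A64main) compares 64 bytes (four 16-byte blocks) per
	   iteration.  */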
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jae	L(mt32)

L(A64main):
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
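	/* Fewer than 64 bytes remain; compare one final 32-byte chunk
	   here if there is one, then fall into the tail handling.  */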
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

L(A32main):
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
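	/* Fewer than 32 bytes remain: hand the leftover length back to
	   the small-block code.  */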
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
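	/* A 16-byte block differed.  For memcmp, %edx holds the pcmpeqb
	   mask minus 0xffff, so its lowest set bit indexes the first
	   mismatching byte.  */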
#ifdef USE_AS_MEMCMPEQ
	movl	$1, %eax
	ret
#else
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)
#endif

	.p2align 4,, 4
L(ATR):
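	/* Both pointers are 16-byte aligned here, so aligned loads
	   (movdqa) can be used for the s2 side as well.  */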
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

L(ATR32res):
	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16 bytes to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp)
#else
# undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
#endif