/* memcmp with SSE2
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (memcmp)
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
#endif
	test %RDX_LP, %RDX_LP
	jz L(finz)
	cmpq $1, %rdx
	jbe L(finr1b)
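	/* From here on %rsi holds the difference s2 - s1, so s2 can be
	   addressed as (%rdi, %rsi) while only %rdi is advanced.  */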
	subq %rdi, %rsi
	movq %rdx, %r10
	cmpq $32, %r10
	jae L(gt32)
	/* Handle small chunks and the final block of fewer than 32 bytes.  */
L(small):
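	/* The remaining length in %r10 (< 32) is decomposed bit by bit:
	   1-, 2-, 4- and 8-byte chunks are compared first; anything left
	   after that is exactly 16 bytes and is handled at L(s16b).  */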
	testq $1, %r10
	jz L(s2b)
	movzbl (%rdi), %eax
	movzbl (%rdi, %rsi), %edx
	subq $1, %r10
	je L(finz1)
	addq $1, %rdi
	subl %edx, %eax
	jnz L(exit)
L(s2b):
	testq $2, %r10
	jz L(s4b)
	movzwl (%rdi), %eax
	movzwl (%rdi, %rsi), %edx
	subq $2, %r10
	je L(fin2_7)
	addq $2, %rdi
	cmpl %edx, %eax
	jnz L(fin2_7)
L(s4b):
	testq $4, %r10
	jz L(s8b)
	movl (%rdi), %eax
	movl (%rdi, %rsi), %edx
	subq $4, %r10
	je L(fin2_7)
	addq $4, %rdi
	cmpl %edx, %eax
	jnz L(fin2_7)
L(s8b):
	testq $8, %r10
	jz L(s16b)
	movq (%rdi), %rax
	movq (%rdi, %rsi), %rdx
	subq $8, %r10
	je L(fin2_7)
	addq $8, %rdi
	cmpq %rdx, %rax
	jnz L(fin2_7)
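	/* Exactly 16 bytes remain; compare them with unaligned SSE loads.  */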
L(s16b):
	movdqu (%rdi), %xmm1
	movdqu (%rdi, %rsi), %xmm0
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %edx
	xorl %eax, %eax
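	/* %edx has a bit set for every byte that compared equal.
	   Subtracting 0xffff gives zero iff all 16 bytes matched;
	   otherwise the lowest set bit of the result is the index of
	   the first mismatching byte.  */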
	subl $0xffff, %edx
	jz L(finz)
	bsfl %edx, %ecx
	leaq (%rdi, %rcx), %rcx
	movzbl (%rcx), %eax
	movzbl (%rsi, %rcx), %edx
	jmp L(finz1)

	.p2align 4,, 4
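	/* Length 1: compare the single byte.  %rsi still holds the s2
	   pointer here; the subtraction above is only done for longer
	   lengths.  */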
L(finr1b):
	movzbl (%rdi), %eax
	movzbl (%rsi), %edx
L(finz1):
	subl %edx, %eax
L(exit):
	ret

	.p2align 4,, 4
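	/* A 2/4/8-byte chunk from s1 is in %rax and the one from s2 in
	   %rdx (equal chunks only arrive here as the final chunk).
	   Locate the lowest differing bit, round it down to a byte
	   boundary and shift that byte into the low position of both
	   registers; little-endian order makes it the first differing
	   byte in memory.  */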
L(fin2_7):
	cmpq %rdx, %rax
	jz L(finz)
	movq %rax, %r11
	subq %rdx, %r11
	bsfq %r11, %rcx
	sarq $3, %rcx
	salq $3, %rcx
	sarq %cl, %rax
	movzbl %al, %eax
	sarq %cl, %rdx
	movzbl %dl, %edx
	subl %edx, %eax
	ret

	.p2align 4,, 4
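	/* All bytes compared equal: return zero.  */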
L(finz):
	xorl %eax, %eax
	ret

	/* For blocks bigger than 32 bytes
	   1. Advance one of the address pointers to be 16B aligned.
	   2. Treat the case of both address pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one address pointer is 16B aligned, so use the
	      memory-operand form of pcmpeqb.  */
	.p2align 4,, 4
L(gt32):
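	/* %r11 = one past the end of s1; %r8 = misalignment of s1
	   within its 16B block.  */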
	movq %rdx, %r11
	addq %rdi, %r11
	movq %rdi, %r8

	andq $15, %r8
	jz L(16am)
	/* Both pointers may be misaligned.  */
	movdqu (%rdi), %xmm1
	movdqu (%rdi, %rsi), %xmm0
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl $0xffff, %edx
	jnz L(neq)
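	/* The first 16 bytes match; align %rdi up to the next 16B
	   boundary (the overlapping bytes are simply compared again).  */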
	neg %r8
	leaq 16(%rdi, %r8), %rdi
L(16am):
	/* If both pointers are now 16B aligned (their difference in
	   %rsi is a multiple of 16), handle them separately with
	   aligned loads only.  */
	testq $15, %rsi
	jz L(ATR)
	testq $16, %rdi
	jz L(A32)
	movdqu (%rdi, %rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi
L(A32):
	movq %r11, %r10
	andq $-32, %r10
	cmpq %r10, %rdi
	jae L(mt16)
	/* Pre-unroll: compare one 32B block if needed so that %rdi is
	   64B aligned for the unrolled 64B loop.  */
	testq $32, %rdi
	jz L(A64)
	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

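	/* %r10 = end of s1 rounded down to 64B; skip the 64B loop if
	   %rdi is already past it.  */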
L(A64):
	movq %r11, %r10
	andq $-64, %r10
	cmpq %r10, %rdi
	jae L(mt32)

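	/* Main loop: compare 64 bytes per iteration in four 16B blocks.
	   s1 (%rdi) is 16B aligned and used as the memory operand of
	   pcmpeqb; s2 is loaded with movdqu since it may be unaligned.  */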
L(A64main):
	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	cmpq %rdi, %r10
	jne L(A64main)

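	/* Fewer than 64 bytes remain before the end; fall back to 32B
	   steps.  */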
L(mt32):
	movq %r11, %r10
	andq $-32, %r10
	cmpq %r10, %rdi
	jae L(mt16)

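	/* Compare 32 bytes per iteration until the 32B-rounded end.  */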
L(A32main):
	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqu (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	cmpq %rdi, %r10
	jne L(A32main)
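	/* Fewer than 32 bytes remain; compute the tail length and finish
	   with the small-block code above.  */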
L(mt16):
	subq %rdi, %r11
	je L(finz)
	movq %r11, %r10
	jmp L(small)

	.p2align 4,, 4
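	/* A 16B block just compared unequal; %edx holds the equality
	   mask minus 0xffff, so its lowest set bit is the offset of the
	   first mismatching byte.  Rebuild the s2 pointer (%rsi held
	   s2 - s1) and return the byte difference.  */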
L(neq):
	bsfl %edx, %ecx
	movzbl (%rdi, %rcx), %eax
	addq %rdi, %rsi
	movzbl (%rsi,%rcx), %edx
	jmp L(finz1)

	.p2align 4,, 4
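	/* Both pointers are 16B aligned (their difference is a multiple
	   of 16), so only aligned accesses are needed from here on.  */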
L(ATR):
	movq %r11, %r10
	andq $-32, %r10
	cmpq %r10, %rdi
	jae L(mt16)
	testq $16, %rdi
	jz L(ATR32)

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi
	cmpq %rdi, %r10
	je L(mt16)

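	/* %r10 = end rounded down to 64B; if %rdi is not yet 64B
	   aligned, compare one 32B block first.  */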
L(ATR32):
	movq %r11, %r10
	andq $-64, %r10
	testq $32, %rdi
	jz L(ATR64)

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

L(ATR64):
	cmpq %rdi, %r10
	je L(mt32)

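	/* Both-aligned main loop: compare 64 bytes per iteration using
	   aligned loads only.  */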
L(ATR64main):
	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi
	cmpq %rdi, %r10
	jne L(ATR64main)

	movq %r11, %r10
	andq $-32, %r10
	cmpq %r10, %rdi
	jae L(mt16)

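	/* Handle the remaining 32B-aligned block with aligned loads,
	   then finish the tail with the small-block code.  */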
L(ATR32res):
	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	movdqa (%rdi,%rsi), %xmm0
	pcmpeqb (%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl $0xffff, %edx
	jnz L(neq)
	addq $16, %rdi

	cmpq %r10, %rdi
	jne L(ATR32res)

	subq %rdi, %r11
	je L(finz)
	movq %r11, %r10
	jmp L(small)
	/* Align to 16 bytes to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)