/* memcmp with SSE2
   Copyright (C) 2009-2016 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (memcmp)
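	/* Input (SysV x86-64 ABI): %rdi = s1, %rsi = s2, %rdx = n.
	   Returns in %eax the difference of the first pair of bytes
	   that differ, each read as unsigned char, or 0 if the blocks
	   are equal.  */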
	test	%rdx, %rdx
	jz	L(finz)
	cmpq	$1, %rdx
	jle	L(finr1b)
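	/* From here on %rsi holds s2 - s1, so (%rdi, %rsi) addresses s2
	   while only %rdi has to be advanced and checked against the
	   bounds.  */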
	subq	%rdi, %rsi
	movq	%rdx, %r10
	cmpq	$32, %r10
	jge	L(gt32)
	/* Handle small chunks and the last block of less than 32 bytes.  */
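	/* The low bits of the remaining length in %r10 (< 32) drive the
	   compare widths: 1, 2, 4 and 8 bytes in turn, then one final
	   16B SSE2 compare if bit 4 is set.  */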
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
L(s16b):
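	/* pcmpeqb sets a byte to 0xff where the inputs match; pmovmskb
	   then yields 0xffff iff all 16 bytes are equal, so subtracting
	   0xffff leaves zero exactly when the blocks match.  */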
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(finr1b):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

	.p2align 4,, 4
L(fin2_7):
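	/* %rax and %rdx hold two equal-sized chunks (2 to 8 bytes) from
	   the same offsets; they may still be equal if the length was
	   exhausted.  Otherwise bsfq on their difference finds the first
	   differing bit, the sar/sal pair rounds it down to a byte
	   boundary, and the %cl shifts bring the first differing byte
	   into %al and %dl.  */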
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

/* For blocks bigger than 32 bytes:
   1. Advance one of the address pointers to be 16B-aligned.
   2. Treat the case of both address pointers being 16B-aligned
      separately, to avoid movdqu.
   3. Handle any blocks of greater than 64 consecutive bytes with
      unrolling to reduce branches.
   4. In all cases at least one address pointer is 16B-aligned, so the
      memory-operand form of pcmpeqb can be used.  */
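/* Illustrative C sketch of the compare step used throughout (SSE2
   intrinsics from <emmintrin.h>; not assembled here, just the same
   idea):

     int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
     if (mask != 0xffff)
       idx = __builtin_ctz (mask ^ 0xffff);   <- first differing byte

   The code below computes mask - 0xffff instead; since the set bits
   of mask are a subset of 0xffff, mask - 0xffff = -(mask ^ 0xffff),
   which is zero in the same cases and has the same lowest set bit.  */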
	.p2align 4,, 4
L(gt32):
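	/* %r11 = s1 + n is the end of the first block; %r8 gets s1's
	   offset within its 16B chunk.  */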
	movq	%rdx, %r11
	addq	%rdi, %r11
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* %rdi is misaligned (and s2 may be too): compare one unaligned
	   16B chunk, then round %rdi up to the next 16B boundary.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* %rdi is now 16B-aligned.  Since %rsi holds s2 - s1, a zero low
	   nibble in %rsi means s2 is 16B-aligned too; handle that case
	   separately with aligned loads.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)
	/* Peel one 32B step if needed so that %rdi is 64B-aligned for
	   the unrolled 64B loop below.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jge	L(mt32)

L(A64main):
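	/* Unrolled main loop: four 16B compares per iteration, with %r10
	   holding the last 64B boundary before the end of the block.  */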
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)

L(A32main):
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
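	/* Fewer than 32 bytes remain.  %r11 is the end pointer, so
	   %r11 - %rdi is the tail length; finish in the small-block
	   code.  */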
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
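	/* %edx holds mask - 0xffff (see the sketch above L(gt32)); its
	   lowest set bit is the index of the first differing byte,
	   which bsfl extracts.  */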
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(ATR):
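	/* Both buffers are 16B-aligned here: %rdi was aligned at
	   L(16am) and %rsi (= s2 - s1) is a multiple of 16, so the s2
	   loads below can use movdqa.  */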
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
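	/* Aligned counterpart of L(A64main): four movdqa-based 16B
	   compares per 64B iteration.  */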
	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)

L(ATR32res):
	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16 bytes to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)