/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC
      characters one vector at a time to check for early mismatches.
      Only do this if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

   When possible the implementation tries to optimize for the frontend
   in the following ways:
   Throughput:
      1. All code sections that fit are able to run optimally out of
         the LSD.
      2. All code sections that fit are able to run optimally out of
         the DSB.
      3. Basic blocks are contained in the minimum number of fetch
         blocks necessary.

   Latency:
      1. Logically connected basic blocks are put in the same
         cache-line.
      2. Logically connected basic blocks that do not fit in the same
         cache-line are put in adjacent lines.  This can benefit from
         L2 spatial prefetching and L1 next-line prefetching.  */
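/* Illustrative only: a rough C-level sketch of the size dispatch
   described above.  The helper names below are for exposition and do
   not exist in this file; the assembly uses masked and overlapping
   vector loads rather than separate helpers:

       if (len <= CHAR_PER_VEC)
	 return cmp_1x_vec_masked (s1, s2, len);
       if (len <= 2 * CHAR_PER_VEC)
	 return cmp_first_1x_and_last_1x_vec (s1, s2, len);
       if (len <= 4 * CHAR_PER_VEC)
	 return cmp_first_2x_and_last_2x_vec (s1, s2, len);
       if (len <= 8 * CHAR_PER_VEC)
	 return cmp_first_4x_and_last_4x_vec (s1, s2, len);
       return cmp_aligned_loop_4x_vec (s1, s2, len);  */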

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define VPTEST	vptestmd

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP	vpcmpub
#  define VPCMPEQ	vpcmpeqb
#  define VPTEST	vptestmb
# endif

# include "reg-macros.h"

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
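/* For example: wmemcmp comparing the elements 0x80000000 and 1 must
   return a negative value (0x80000000 is negative as a signed 32-bit
   element), while memcmp comparing the bytes 0x80 and 0x01 must
   return a positive value (bytes compare as unsigned).  */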

	.section SECTION(.text), "ax", @progbits
/* Cache align the memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, CHAR_PER_VEC] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create a mask of the chars that are guaranteed to be valid
	   because of the length (edx).  Using masked moves allows us to
	   skip checks for page crosses/zero size.  */
	mov	$-1, %VRAX
	bzhi	%VRDX, %VRAX, %VRAX
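	/* For example (memcmp, VEC_SIZE == 32): with edx == 5, bzhi
	   leaves eax == 0x1f, so only the low 5 bytes participate in
	   the masked load of s2 and the masked compare with s1 below.  */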
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the EVEX mask) can be
	   very slow.  The expectation is that this is not the norm and
	   "most" code will not regularly call 'memcmp' with length = 0
	   on memory that is not mapped.  */
	KMOV	%VRAX, %k2


	/* Safe to load a full VEC with the mask.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	/* Slightly different method for VEC_SIZE == 64 to save a bit of
	   code size.  This allows us to fit L(return_vec_0) entirely in
	   the first cache line.  */
# if VEC_SIZE == 64
	VPCMPEQ	(%rdi), %VMM(2), %k1{%k2}
	KMOV	%k1, %VRCX
	sub	%VRCX, %VRAX
# else
	VPCMP	$4, (%rdi), %VMM(2), %k1{%k2}
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
# endif
	jnz	L(return_vec_0)
	ret

	.p2align 4,, 11
L(return_vec_0):
	bsf	%VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
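	/* The setg/lea pair below computes 2 * (s1_elem > s2_elem) - 1,
	   i.e. 1 if s1's element is greater and -1 if it is smaller
	   (they cannot be equal here since this is the mismatching
	   element).  */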
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
#  if VEC_SIZE == 64
	movb	(%rdi, %rax), %al
#  else
	movzbl	(%rdi, %rax), %eax
#  endif
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 11
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VMM(1)
	/* Use compare-not-equals to directly check for a mismatch.  */
	VPCMP	$4, (%rdi), %VMM(1), %k1
	KMOV	%k1, %VRAX
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	test	%VRAX, %VRAX
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VPCMP	$4, VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV	%k1, %VRCX
	test	%VRCX, %VRCX
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
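	/* The remaining length is covered by loading the last 4 vectors
	   so that they end exactly at the end of s1/s2.  These may
	   overlap the 4 vectors already compared above; any overlapping
	   chars are already known to be equal, so re-comparing them is
	   harmless.  */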

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with complex addressing
	   modes.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %VMM(1), %VMM(1)
	vpxorq	(VEC_SIZE)(%rdi), %VMM(2), %VMM(2)

	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	/* Ternary logic to XOR (VEC_SIZE * 3)(%rdi) with VEC(4) while
	   ORing with VEC(1).  The result is stored in VEC(4).  */
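	/* Illustrative derivation of the 0xde immediate: with the
	   truth-table index formed as (VEC(4) bit << 2) | (VEC(1) bit
	   << 1) | (memory bit), the function (VEC(4) ^ mem) | VEC(1)
	   evaluates to 0b11011110 == 0xde across the eight index
	   values.  */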
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)

	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
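	/* (0xfe above is the truth table for a three-way OR: every
	   input combination except all-zeros yields 1, i.e.
	   0b11111110.)  */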

	/* Test VEC(4) against itself.  Store any CHAR mismatches in
	   k1.  */
	VPTEST	%VMM(4), %VMM(4), %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	KMOV	%k1, %VRCX
	test	%VRCX, %VRCX
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 9
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	/* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KMOV	%k1, %VRCX
# endif
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_0)

	VPTEST	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_1)

	VPTEST	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_2)
	.p2align 4,, 2
L(return_vec_3):
	/* bsf saves 1 byte over tzcnt.  This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) in one
	   cache line.  */
	bsf	%VRCX, %VRCX
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4,, 8
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsf	%VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 7
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsf	%VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(more_8x_vec):
	/* Set the loop bound in rdx: rdx = s1 + size - 4 * VEC_SIZE
	   (in bytes).  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
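	/* With rsi == s2 - s1, (%rsi, %rdi) in the loop below addresses
	   the s2 data corresponding to the s1 data at (%rdi).  */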
	/* Align the s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
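	/* NB: `subq $-(VEC_SIZE * 4)` rather than `addq $(VEC_SIZE * 4)`
	   is likely used because, for VEC_SIZE == 32, -128 fits in a
	   sign-extended 8-bit immediate while +128 does not.  */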

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %VMM(1)
	vpxorq	(%rdi), %VMM(1), %VMM(1)
	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpxorq	VEC_SIZE(%rdi), %VMM(2), %VMM(2)
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST	%VMM(4), %VMM(4), %k1
	/* If VEC_SIZE == 64 just branch with KTEST.  We have free port0
	   space and it allows the loop to fit in 2x cache lines
	   instead of 3.  */
# if VEC_SIZE == 64
	KTEST	%k1, %k1
# else
	KMOV	%k1, %VRCX
	test	%VRCX, %VRCX
# endif
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)
	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
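	/* If rdi >= 3 * VEC_SIZE at most one vector of data remains; if
	   rdi >= 2 * VEC_SIZE at most two vectors remain.  Note that
	   rdx still holds s1 + size - 4 * VEC_SIZE and rsi still holds
	   s2 - s1, so (VEC_SIZE * 3)(%rdx) is the last vector of s1.  */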
	cmpl	$(VEC_SIZE * 3), %edi
	jge	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)

	/* Separate logic, as we can only use testb for
	   VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	testb	%dil, %dil
	js	L(8x_last_2x_vec)
# else
	cmpl	$(VEC_SIZE * 2), %edi
	jge	L(8x_last_2x_vec)
# endif

	vpxorq	(VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)

	VMOVU	(%rsi, %rdx), %VMM(1)
	vpxorq	(%rdx), %VMM(1), %VMM(1)

	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST	%VMM(4), %VMM(4), %k1
	/* L(8x_end_return_vec_0_1_2_3) expects the bitmask to still be
	   in `k1` if VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KTEST	%k1, %k1
# else
	KMOV	%k1, %VRCX
	test	%VRCX, %VRCX
# endif
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 6
L(8x_last_2x_vec):
	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(8x_return_vec_2)
	.p2align 4,, 5
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block),
	   but not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsf	%VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4,, 8
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(return_vec_0_end)
	ret


	/* Don't fully align.  It takes 2 fetch blocks either way, and
	   aligning would cause the code to spill into another cache
	   line.  */
	.p2align 4,, 3
L(return_vec_1_end):
	/* Use bsf to save code size.  This is necessary to have
	   L(one_or_less) fit in the aligning bytes between.  */
	bsf	%VRAX, %VRAX
	addl	%edx, %eax
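	/* eax now holds len + index, so the -(VEC_SIZE * 2)
	   displacement below addresses the mismatching CHAR at offset
	   (len - 2 * CHAR_PER_VEC + index) CHARs into s1/s2.  */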
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 2
	/* Don't align.  It takes 2 fetch blocks either way, and
	   aligning would cause the code to spill into another cache
	   line.  */
L(return_vec_0_end):
	bsf	%VRAX, %VRAX
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* evex256: 2 bytes until the next cache line.  evex512: 46
	   bytes until the next cache line.  */
END (MEMCMP)
#endif