| 1 | /* Copyright (C) 2011-2016 Free Software Foundation, Inc. | 
| 2 |    Contributed by Intel Corporation. | 
| 3 |    This file is part of the GNU C Library. | 
| 4 |  | 
| 5 |    The GNU C Library is free software; you can redistribute it and/or | 
| 6 |    modify it under the terms of the GNU Lesser General Public | 
| 7 |    License as published by the Free Software Foundation; either | 
| 8 |    version 2.1 of the License, or (at your option) any later version. | 
| 9 |  | 
| 10 |    The GNU C Library is distributed in the hope that it will be useful, | 
| 11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 13 |    Lesser General Public License for more details. | 
| 14 |  | 
| 15 |    You should have received a copy of the GNU Lesser General Public | 
| 16 |    License along with the GNU C Library; if not, see | 
| 17 |    <http://www.gnu.org/licenses/>.  */ | 
| 18 |  | 
| 19 | #include <sysdep.h> | 
| 20 |  | 
| 21 | /* Fast SSE2 version using pmaxub and a 64-byte loop.  */ | 
| 22 |  | 
| 23 | 	.text | 
| 24 | ENTRY(memchr)	/* In: rdi = buffer, low byte of rsi = byte to find, rdx = length in bytes.  Out: rax = address of the first matching byte inside the buffer, or 0 if there is none.  */ | 
| 25 | 	movd	%rsi, %xmm1 | 
| 26 | 	mov	%rdi, %rcx | 
| 27 |  | 
| 28 | 	punpcklbw %xmm1, %xmm1	/* Duplicate byte 0 into bytes 0-1.  */ | 
| 29 | 	test	%rdx, %rdx | 
| 30 | 	jz	L(return_null)	/* Zero length: nothing to search.  */ | 
| 31 | 	punpcklbw %xmm1, %xmm1	/* Byte 0 now fills bytes 0-3.  */ | 
| 32 |  | 
| 33 | 	and	$63, %rcx	/* rcx = buffer address modulo 64.  */ | 
| 34 | 	pshufd	$0, %xmm1, %xmm1	/* Broadcast dword 0: all 16 bytes of xmm1 = byte to find.  */ | 
| 35 |  | 
| 36 | 	cmp	$48, %rcx | 
| 37 | 	ja	L(crosscache)	/* Offset 49..63: a 16-byte load at rdi would cross a 64-byte boundary.  */ | 
| 38 |  | 
| 39 | 	movdqu	(%rdi), %xmm0	/* Unaligned load of the first 16 bytes.  */ | 
| 40 | 	pcmpeqb	%xmm1, %xmm0	/* Each byte becomes 0xff where it equals the byte to find.  */ | 
| 41 | 	pmovmskb %xmm0, %eax	/* eax bit i is set iff byte i matched.  */ | 
| 42 | 	test	%eax, %eax | 
| 43 |  | 
| 44 | 	jnz	L(matches_1)	/* Candidate found; L(matches_1) validates it against the length in rdx.  */ | 
| 45 | 	sub	$16, %rdx | 
| 46 | 	jbe	L(return_null)	/* Length was at most 16 and nothing matched.  */ | 
| 47 | 	add	$16, %rdi | 
| 48 | 	and	$15, %rcx	/* rcx = original address modulo 16.  */ | 
| 49 | 	and	$-16, %rdi	/* rdi = first 16-byte boundary above the original start.  */ | 
| 50 | 	add	%rcx, %rdx	/* Aligning stepped back rcx bytes that get rescanned; rdx = bytes remaining from rdi.  */ | 
| 51 | 	sub	$64, %rdx | 
| 52 | 	jbe	L(exit_loop)	/* At most 64 bytes remain from rdi.  */ | 
| 53 | 	jmp	L(loop_prolog) | 
| 54 |  | 
| 55 | 	.p2align 4 | 
| 56 | L(crosscache):	/* The start lies in the last 15 bytes of a 64-byte block: load the enclosing aligned 16 bytes and ignore the bytes before the start.  */ | 
| 57 | 	and	$15, %rcx	/* rcx = start address modulo 16.  */ | 
| 58 | 	and	$-16, %rdi	/* Round rdi down to a 16-byte boundary.  */ | 
| 59 | 	movdqa	(%rdi), %xmm0 | 
| 60 |  | 
| 61 | 	pcmpeqb	%xmm1, %xmm0 | 
| 62 | /* Check if there is a match.  */ | 
| 63 | 	pmovmskb %xmm0, %eax | 
| 64 | /* Remove the leading bytes.  */ | 
| 65 | 	sar	%cl, %eax	/* Bit 0 of eax now corresponds to the original start address.  */ | 
| 66 | 	test	%eax, %eax | 
| 67 | 	je	L(unaligned_no_match) | 
| 68 | /* Check which byte is a match.  */ | 
| 69 | 	bsf	%eax, %eax	/* eax = offset of the first match from the original start.  */ | 
| 70 |  | 
| 71 | 	sub	%rax, %rdx | 
| 72 | 	jbe	L(return_null)	/* Offset >= length: the match is past the end of the buffer.  */ | 
| 73 | 	add	%rdi, %rax | 
| 74 | 	add	%rcx, %rax	/* rax = aligned base + misalignment + offset = address of the match.  */ | 
| 75 | 	ret | 
| 76 |  | 
| 77 | 	.p2align 4 | 
| 78 | L(unaligned_no_match):	/* No match in the first aligned chunk.  In: rdx = length, rcx = start address modulo 16 (< 16), rdi = aligned base.  Clobbers rcx, which is not read again before it is reloaded.  */ | 
| 79 | 	neg	%rcx	/* Compute rdx - (16 - rcx) instead of (rdx + rcx) - 16: the addition wraps for lengths near SIZE_MAX and would wrongly return NULL.  */ | 
| 80 | 	add	$16, %rcx	/* rcx = 16 - misalignment = buffer bytes covered by the first chunk (1..16).  */ | 
| 81 | 	sub	%rcx, %rdx	/* rdx = bytes remaining from the next aligned chunk; CF/ZF set iff length <= rcx.  */ | 
| 82 | 	jbe	L(return_null)	/* The buffer ended inside the first chunk.  */ | 
| 83 | 	add	$16, %rdi	/* Advance to the next aligned 16-byte chunk.  */ | 
| 84 | 	sub	$64, %rdx | 
| 85 | 	jbe	L(exit_loop)	/* At most 64 bytes remain: use the length-checked tail; otherwise fall through to L(loop_prolog).  */ | 
| 86 | 	.p2align 4 | 
| 87 | L(loop_prolog):	/* In: rdi is 16-byte aligned, rdx = (bytes remaining from rdi) - 64 and is > 0, so the next 64 bytes are all inside the buffer and need no length check.  */ | 
| 88 | 	movdqa	(%rdi), %xmm0 | 
| 89 | 	pcmpeqb	%xmm1, %xmm0 | 
| 90 | 	pmovmskb %xmm0, %eax | 
| 91 | 	test	%eax, %eax | 
| 92 | 	jnz	L(matches)	/* Match in bytes 0-15.  */ | 
| 93 |  | 
| 94 | 	movdqa	16(%rdi), %xmm2 | 
| 95 | 	pcmpeqb	%xmm1, %xmm2 | 
| 96 | 	pmovmskb %xmm2, %eax | 
| 97 | 	test	%eax, %eax | 
| 98 | 	jnz	L(matches16)	/* Match in bytes 16-31.  */ | 
| 99 |  | 
| 100 | 	movdqa	32(%rdi), %xmm3 | 
| 101 | 	pcmpeqb	%xmm1, %xmm3 | 
| 102 | 	pmovmskb %xmm3, %eax | 
| 103 | 	test	%eax, %eax | 
| 104 | 	jnz	L(matches32)	/* Match in bytes 32-47.  */ | 
| 105 |  | 
| 106 | 	movdqa	48(%rdi), %xmm4 | 
| 107 | 	pcmpeqb	%xmm1, %xmm4 | 
| 108 | 	add	$64, %rdi	/* rdi now points just past the 64 bytes checked; rdx = bytes remaining from the new rdi.  */ | 
| 109 | 	pmovmskb %xmm4, %eax | 
| 110 | 	test	%eax, %eax | 
| 111 | 	jnz	L(matches0)	/* Match in bytes 48-63; L(matches0) compensates for rdi already being advanced.  */ | 
| 112 |  | 
| 113 | 	test	$0x3f, %rdi | 
| 114 | 	jz	L(align64_loop)	/* rdi is 64-byte aligned: enter the main loop.  */ | 
| 115 |  | 
| 116 | 	sub	$64, %rdx	/* rdx = (bytes remaining from rdi) - 64 again.  */ | 
| 117 | 	jbe	L(exit_loop)	/* At most 64 bytes remain.  */ | 
| 118 |  | 
| 119 | 	movdqa	(%rdi), %xmm0 | 
| 120 | 	pcmpeqb	%xmm1, %xmm0 | 
| 121 | 	pmovmskb %xmm0, %eax | 
| 122 | 	test	%eax, %eax | 
| 123 | 	jnz	L(matches) | 
| 124 |  | 
| 125 | 	movdqa	16(%rdi), %xmm2 | 
| 126 | 	pcmpeqb	%xmm1, %xmm2 | 
| 127 | 	pmovmskb %xmm2, %eax | 
| 128 | 	test	%eax, %eax | 
| 129 | 	jnz	L(matches16) | 
| 130 |  | 
| 131 | 	movdqa	32(%rdi), %xmm3 | 
| 132 | 	pcmpeqb	%xmm1, %xmm3 | 
| 133 | 	pmovmskb %xmm3, %eax | 
| 134 | 	test	%eax, %eax | 
| 135 | 	jnz	L(matches32) | 
| 136 |  | 
| 137 | 	movdqa	48(%rdi), %xmm3 | 
| 138 | 	pcmpeqb	%xmm1, %xmm3 | 
| 139 | 	pmovmskb %xmm3, %eax | 
| 140 |  | 
| 141 | 	add	$64, %rdi	/* Past the second 64 bytes; rdx = bytes remaining from the new rdi.  */ | 
| 142 | 	test	%eax, %eax | 
| 143 | 	jnz	L(matches0) | 
| 144 |  | 
| 145 | 	mov	%rdi, %rcx | 
| 146 | 	and	$-64, %rdi	/* Round rdi down to a 64-byte boundary for the main loop.  */ | 
| 147 | 	and	$63, %rcx	/* rcx = number of bytes the rounding stepped back.  */ | 
| 148 | 	add	%rcx, %rdx	/* Those bytes are rescanned, so count them again; falls through to L(align64_loop).  */ | 
| 149 |  | 
| 150 | 	.p2align 4 | 
| 151 | L(align64_loop):	/* Main loop.  In: rdi is 64-byte aligned, rdx = bytes remaining from rdi.  Tests 64 bytes per iteration with a single branch.  */ | 
| 152 | 	sub	$64, %rdx | 
| 153 | 	jbe	L(exit_loop)	/* At most 64 bytes remain: leave for the length-checked tail.  */ | 
| 154 | 	movdqa	(%rdi), %xmm0 | 
| 155 | 	movdqa	16(%rdi), %xmm2 | 
| 156 | 	movdqa	32(%rdi), %xmm3 | 
| 157 | 	movdqa	48(%rdi), %xmm4 | 
| 158 |  | 
| 159 | 	pcmpeqb	%xmm1, %xmm0 | 
| 160 | 	pcmpeqb	%xmm1, %xmm2 | 
| 161 | 	pcmpeqb	%xmm1, %xmm3 | 
| 162 | 	pcmpeqb	%xmm1, %xmm4 | 
| 163 |  | 
| 164 | 	pmaxub	%xmm0, %xmm3	/* Unsigned max of 0x00/0xff masks acts as OR: xmm3 = chunks 0 and 2 combined.  */ | 
| 165 | 	pmaxub	%xmm2, %xmm4	/* xmm4 = chunks 1 and 3 combined.  */ | 
| 166 | 	pmaxub	%xmm3, %xmm4	/* xmm4 = all four chunks combined.  */ | 
| 167 | 	pmovmskb %xmm4, %eax | 
| 168 |  | 
| 169 | 	add	$64, %rdi | 
| 170 |  | 
| 171 | 	test	%eax, %eax | 
| 172 | 	jz	L(align64_loop)	/* No match anywhere in these 64 bytes.  */ | 
| 173 |  | 
| 174 | 	sub	$64, %rdi	/* Match found: step back to the start of the 64 bytes that contain it.  */ | 
| 175 |  | 
| 176 | 	pmovmskb %xmm0, %eax	/* xmm0 and xmm2 still hold the unmerged masks of chunks 0 and 1.  */ | 
| 177 | 	test	%eax, %eax | 
| 178 | 	jnz	L(matches) | 
| 179 |  | 
| 180 | 	pmovmskb %xmm2, %eax | 
| 181 | 	test	%eax, %eax | 
| 182 | 	jnz	L(matches16) | 
| 183 |  | 
| 184 | 	movdqa	32(%rdi), %xmm3	/* xmm3 was overwritten by pmaxub: recompute the mask for bytes 32-47.  */ | 
| 185 | 	pcmpeqb	%xmm1, %xmm3 | 
| 186 |  | 
| 187 | 	pcmpeqb	48(%rdi), %xmm1	/* Every path from here returns, so xmm1 is reused as the mask for bytes 48-63.  */ | 
| 188 | 	pmovmskb %xmm3, %eax | 
| 189 | 	test	%eax, %eax | 
| 190 | 	jnz	L(matches32) | 
| 191 |  | 
| 192 | 	pmovmskb %xmm1, %eax	/* Chunks 0-2 had no match, so it must be in bytes 48-63.  */ | 
| 193 | 	bsf	%eax, %eax | 
| 194 | 	lea	48(%rdi, %rax), %rax | 
| 195 | 	ret | 
| 196 |  | 
| 197 | 	.p2align 4 | 
| 198 | L(exit_loop):	/* Tail.  In: rdi is 16-byte aligned, rdx = (bytes remaining from rdi) - 64, which is <= 0 when treated as signed.  */ | 
| 199 | 	add	$32, %rdx	/* rdx = remaining - 32.  */ | 
| 200 | 	jle	L(exit_loop_32)	/* At most 32 bytes remain.  */ | 
| 201 |  | 
| 202 | 	movdqa	(%rdi), %xmm0	/* More than 32 bytes remain, so bytes 0-31 need no length check.  */ | 
| 203 | 	pcmpeqb	%xmm1, %xmm0 | 
| 204 | 	pmovmskb %xmm0, %eax | 
| 205 | 	test	%eax, %eax | 
| 206 | 	jnz	L(matches) | 
| 207 |  | 
| 208 | 	movdqa	16(%rdi), %xmm2 | 
| 209 | 	pcmpeqb	%xmm1, %xmm2 | 
| 210 | 	pmovmskb %xmm2, %eax | 
| 211 | 	test	%eax, %eax | 
| 212 | 	jnz	L(matches16) | 
| 213 |  | 
| 214 | 	movdqa	32(%rdi), %xmm3 | 
| 215 | 	pcmpeqb	%xmm1, %xmm3 | 
| 216 | 	pmovmskb %xmm3, %eax | 
| 217 | 	test	%eax, %eax | 
| 218 | 	jnz	L(matches32_1)	/* Candidate in bytes 32-47; validated against rdx = remaining - 32.  */ | 
| 219 | 	sub	$16, %rdx	/* rdx = remaining - 48.  */ | 
| 220 | 	jle	L(return_null)	/* The buffer ends within bytes 32-47.  */ | 
| 221 |  | 
| 222 | 	pcmpeqb	48(%rdi), %xmm1	/* xmm1 is reused as the mask for bytes 48-63; every path from here returns.  */ | 
| 223 | 	pmovmskb %xmm1, %eax | 
| 224 | 	test	%eax, %eax | 
| 225 | 	jnz	L(matches48_1)	/* Candidate validated against rdx = remaining - 48.  */ | 
| 226 | 	xor	%rax, %rax	/* No match: return 0.  */ | 
| 227 | 	ret | 
| 228 |  | 
| 229 | 	.p2align 4 | 
| 230 | L(exit_loop_32):	/* Tail with at most 32 bytes left.  In: rdi is 16-byte aligned, rdx = (bytes remaining from rdi) - 32.  */ | 
| 231 | 	add	$32, %rdx	/* rdx = bytes remaining from rdi.  */ | 
| 232 | 	movdqa	(%rdi), %xmm0 | 
| 233 | 	pcmpeqb	%xmm1, %xmm0 | 
| 234 | 	pmovmskb %xmm0, %eax | 
| 235 | 	test	%eax, %eax | 
| 236 | 	jnz	L(matches_1)	/* Candidate in bytes 0-15; validated against rdx.  */ | 
| 237 | 	sub	$16, %rdx	/* rdx = remaining - 16.  */ | 
| 238 | 	jbe	L(return_null)	/* The buffer ends within bytes 0-15.  */ | 
| 239 |  | 
| 240 | 	pcmpeqb	16(%rdi), %xmm1	/* xmm1 is reused as the mask for bytes 16-31; every path from here returns.  */ | 
| 241 | 	pmovmskb %xmm1, %eax | 
| 242 | 	test	%eax, %eax | 
| 243 | 	jnz	L(matches16_1)	/* Candidate validated against rdx = remaining - 16.  */ | 
| 244 | 	xor	%rax, %rax	/* No match: return 0.  */ | 
| 245 | 	ret | 
| 246 |  | 
| 247 | 	.p2align 4 | 
| 248 | L(matches0):	/* eax = non-zero match mask for the 16 bytes ending at rdi (rdi was already advanced past them).  */ | 
| 249 | 	bsf	%eax, %eax	/* eax = index of the lowest set bit = offset of the first match.  */ | 
| 250 | 	lea	-16(%rdi, %rax), %rax	/* rax = (rdi - 16) + offset.  */ | 
| 251 | 	ret | 
| 252 |  | 
| 253 | 	.p2align 4 | 
| 254 | L(matches):	/* eax = non-zero match mask for the 16 bytes at rdi; caller guarantees they are inside the buffer.  */ | 
| 255 | 	bsf	%eax, %eax | 
| 256 | 	lea	(%rdi, %rax), %rax	/* rax = rdi + offset.  */ | 
| 257 | 	ret | 
| 258 |  | 
| 259 | 	.p2align 4 | 
| 260 | L(matches16):	/* Same as L(matches) for the 16 bytes at rdi + 16.  */ | 
| 261 | 	bsf	%eax, %eax | 
| 262 | 	lea	16(%rdi, %rax), %rax | 
| 263 | 	ret | 
| 264 |  | 
| 265 | 	.p2align 4 | 
| 266 | L(matches32):	/* Same as L(matches) for the 16 bytes at rdi + 32.  */ | 
| 267 | 	bsf	%eax, %eax | 
| 268 | 	lea	32(%rdi, %rax), %rax | 
| 269 | 	ret | 
| 270 |  | 
| 271 | 	.p2align 4 | 
| 272 | L(matches_1):	/* Length-checked variant: eax = match mask for the 16 bytes at rdi, rdx = bytes remaining from rdi.  */ | 
| 273 | 	bsf	%eax, %eax | 
| 274 | 	cmp	%rax, %rdx	/* Same flags as subtracting; rdx is not read again on either path.  */ | 
| 275 | 	jbe	L(return_null)	/* remaining <= offset: the match is past the end of the buffer.  */ | 
| 276 | 	lea	(%rdi, %rax), %rax | 
| 277 | 	ret | 
| 278 |  | 
| 279 | 	.p2align 4 | 
| 280 | L(matches16_1):	/* As L(matches_1) for the chunk at rdi + 16; rdx = bytes remaining from rdi + 16.  */ | 
| 281 | 	bsf	%eax, %eax | 
| 282 | 	cmp	%rax, %rdx | 
| 283 | 	jbe	L(return_null) | 
| 284 | 	lea	16(%rax, %rdi), %rax | 
| 285 | 	ret | 
| 286 |  | 
| 287 | 	.p2align 4 | 
| 288 | L(matches32_1):	/* As L(matches_1) for the chunk at rdi + 32; rdx = bytes remaining from rdi + 32.  */ | 
| 289 | 	bsf	%eax, %eax | 
| 290 | 	cmp	%rax, %rdx | 
| 291 | 	jbe	L(return_null) | 
| 292 | 	lea	32(%rax, %rdi), %rax | 
| 293 | 	ret | 
| 294 |  | 
| 295 | 	.p2align 4 | 
| 296 | L(matches48_1):	/* As L(matches_1) for the chunk at rdi + 48; rdx = bytes remaining from rdi + 48.  */ | 
| 297 | 	bsf	%eax, %eax | 
| 298 | 	cmp	%rax, %rdx | 
| 299 | 	jbe	L(return_null) | 
| 300 | 	lea	48(%rax, %rdi), %rax | 
| 301 | 	ret | 
| 302 |  | 
| 303 | 	.p2align 4 | 
| 304 | L(return_null):	/* Shared "not found" exit: return 0 in rax.  */ | 
| 305 | 	xor	%eax, %eax	/* Writing eax zero-extends, so all of rax becomes 0.  */ | 
| 306 | 	ret | 
| 307 | END(memchr) | 
| 308 |  | 
| 309 | strong_alias (memchr, __memchr)	/* Presumably makes __memchr a second name for the routine above; the macro is defined outside this file -- confirm there.  */ | 
| 310 |  | 
| 311 | libc_hidden_builtin_def(memchr)	/* Macro defined outside this file; presumably sets up the library-internal hidden definition of memchr -- confirm there.  */ | 
| 312 |  |